/*********************************************************/
/* TAUCS                                                 */
/* Author: Sivan Toledo                                  */
/*                                                       */
/*********************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <assert.h>


/*
  extern int EF_ALIGNMENT;
  extern int EF_PROTECT_BELOW;
  extern int EF_PROTECT_FREE;
  extern int EF_ALLOW_MALLOC_0;
  extern int EF_FILL;
*/



#include "taucs.h"

#define HEADER_NROWS   0
#define HEADER_NCOLS   1
#define HEADER_FLAGS   2
#define HEADER_COLPERM 3
#define HEADER_IPIVOTS 4
#define HEADER_LCLEN   5
#define HEADER_UCLEN   6
#define HEADER 7

#ifndef TAUCS_CORE_GENERAL

#ifdef OSTYPE_win32
#include <io.h>
#else
#include <unistd.h>
#include <sys/uio.h>
#endif

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

#define iabs(x) ((x) > 0 ? (x) : (-(x)))

static double remaining_memory;

/*********************************************************/
/* NEW IO ROUTINES                                       */
/*********************************************************/

/*
  Layout of data structures in the file:
  m       (number of rows)
  n       (number of cols)
  flags
  colperm
  ipivots
  Lclen      (nonzeros per column in L)
  Lclen      (nonzeros per column in U)
  Lrowind[0]
  Lvalues[0]
  Urowind[0]
  Uvalues[0]
  Lrowind[1]
  Lvalues[1]
  Urowind[1]
  Uvalues[1]
  ...
*/

static
int Lappendcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  if (taucs_io_append(LU,
		       0 + HEADER+j*4,
		       nnz,1,
		       TAUCS_INT,
		       ind)) return TAUCS_ERROR_IO;
  if (taucs_io_append(LU,
		      1 + HEADER+j*4,
		      nnz,1,
		      TAUCS_CORE_DATATYPE,
		      re)) return TAUCS_ERROR_IO;
  return TAUCS_SUCCESS;
}

static
int Uappendcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  if (taucs_io_append(LU,
		       2 + HEADER+j*4,
		       nnz,1,
		       TAUCS_INT,
		       ind)) return TAUCS_ERROR_IO;
  if (taucs_io_append(LU,
		       3 + HEADER+j*4,
		       nnz,1,
		       TAUCS_CORE_DATATYPE,
		       re)) return TAUCS_ERROR_IO;
  return TAUCS_SUCCESS;
}

static
int Lreadcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  if (taucs_io_read(LU,
		     0 + HEADER+j*4,
		     nnz,1,
		     TAUCS_INT,
		     ind)) return TAUCS_ERROR_IO;
  if (taucs_io_read(LU,
		     1 + HEADER+j*4,
		     nnz,1,
		     TAUCS_CORE_DATATYPE,
		     re)) return TAUCS_ERROR_IO;
  return TAUCS_SUCCESS;
}

static
int Ureadcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  if (taucs_io_read(LU,
		     2 + HEADER+j*4,
		     nnz,1,
		     TAUCS_INT,
		     ind)) return TAUCS_ERROR_IO;
  if (taucs_io_read(LU,
		     3 + HEADER+j*4,
		     nnz,1,
		     TAUCS_CORE_DATATYPE,
		     re)) return TAUCS_ERROR_IO;
  return TAUCS_SUCCESS;
}

/*********************************************************/
/* ANALYSIS PHASE                                        */
/*********************************************************/

/* BUFFER SIZES AND MEMORY MANAGEMENT */

static int get_iobufsize()
{
  return 65536; /* minimal size of I/O buffers for good performance */
}

/* COLUMN-HEAP OPERATIONS */

/* The last entry in each input run must be extracted last among
   all the entries from that run, otherwise the heap might
   overflow */

static int heap_compare(int* heap, int i, int j) 
{
  /* compare columns */

  if (heap[3*i+1] < heap[3*j+1]) return -1;
  if (heap[3*i+1] > heap[3*j+1]) return  1;

  /* same column, compare runs (and order within runs) */

  if (heap[3*i+2] < heap[3*j+2]) return -1;
  if (heap[3*i+2] > heap[3*j+2]) return  1;

  return 0;
}

static void heap_heapify(int* heap, int* heapsize,int p) 
{
  int r,l,smallest;
  int i,j,k;

  l = p * 2;
  r = l + 1;

  if (l != 2*p || r != 2*p+1)
    taucs_printf("heap: left-right computation failed\n");

  if (l < *heapsize && heap_compare(heap,l,p) < 0)
    smallest = l;
  else
    smallest = p;

  if (r < *heapsize && heap_compare(heap,r,smallest) < 0)
    smallest = r;
  
  if (smallest != p) {
    i = heap[3*p]; 
    j = heap[3*p+1]; 
    k = heap[3*p+2]; 

    heap[3*p]   = heap[3*smallest];
    heap[3*p+1] = heap[3*smallest+1];
    heap[3*p+2] = heap[3*smallest+2];

    heap[3*smallest]   = i;
    heap[3*smallest+1] = j;
    heap[3*smallest+2] = k;
    
    heap_heapify(heap,heapsize,smallest);
  }
}

#if 0
static void x_heap_insert(int* heap, int* heapsize, int i, int j, int k)
{
  int l = *heapsize;

  heap[3*l]   = i;
  heap[3*l+1] = j;
  heap[3*l+2] = k;

  (*heapsize)++;
}  

static void x_heap_extract_min(int* heap, int* heapsize, int* i, int* j, int* k) 
{
  int l,m,mv;

  if (*heapsize <= 0)
    taucs_printf("heap: Trying to extract from an expty heap\n");

  mv = INT_MAX;
  for (l=0; l<(*heapsize); l++) {
    if (heap[3*l+1] < mv) {
      m = l;
      mv = heap[3*l+1];
    }
  }
  
  *i = heap[3*m];
  *j = heap[3*m+1];
  *k = heap[3*m+2];

  (*heapsize)--;
  for (l=m; l<(*heapsize); l++) {
    heap[3*l]   = heap[3*(l+1)];
    heap[3*l+1] = heap[3*(l+1) + 1];
    heap[3*l+2] = heap[3*(l+1) + 2];
  }
}
#endif

static void heap_insert(int* heap, int* heapsize, int i, int j, int k)
{
  int child, parent;

  (*heapsize)++;

  child = (*heapsize-1);
  parent = child / 2;
  while (child > 0 && 
	 ((heap[3*parent+1] > j) || 
	  ((heap[3*parent+1] == j) && (heap[3*parent+2] > k)))) {
    heap[3*child]   = heap[3*parent];
    heap[3*child+1] = heap[3*parent+1];
    heap[3*child+2] = heap[3*parent+2];
    child = parent;
    parent = child / 2;
  }

  heap[3*child]   = i;
  heap[3*child+1] = j;
  heap[3*child+2] = k;
}

static void heap_extract_min(int* heap, int* heapsize, int* i, int* j, int* k) 
{
  if (*heapsize <= 0)
    taucs_printf("heap: Trying to extract from an expty heap\n");
  
  *i = heap[0];
  *j = heap[1];
  *k = heap[2];

  heap[0] = heap[3 * ((*heapsize)-1)];
  heap[1] = heap[3 * ((*heapsize)-1) + 1];
  heap[2] = heap[3 * ((*heapsize)-1) + 2];

  (*heapsize) --;

  heap_heapify(heap,heapsize,0);
}

/* SYMMETRIC SKELETON GRAPH OPERATIONS */

static char skel_basename[256];
static int* skel_buffer = NULL;
static int  skel_buffer_size = -1; /* counted in PAIRS of integers */
static int  skel_buffer_ptr  = -1; /* index into PAIRS of integers */
static int  skel_outfiles    = -1;
static int  skel_infiles     = -1;
static int  skel_outfile     = -1;

static char skel_inphase     = 0;
static char skel_outphase    = 0;

static int skel_get_lastcol  = -1;
static int skel_next         = -1;

static int skel_compare(const void* e1, const void* e2) 
{
  /* we sort according to columns */
  int j1, j2;
  const int* jp1;
  const int* jp2;

  jp1 = (const int*) e1;
  jp2 = (const int*) e2;

  j1 = jp1[1];
  j2 = jp2[1];
  if (j1 < j2) return -1;
  if (j1 > j2) return 1;

  /* same column, compare rows */

  j1 = jp1[0];
  j2 = jp2[0];
  if (j1 < j2) return -1;
  if (j1 > j2) return 1;
  
  return 0;
}

static int skel_init(char* basename)
{
  sprintf(skel_basename,"%s.ssort",basename);

  /* adjust remaining memory.
     We allocate 2 io buffers for the sort phase, plus an array
     of file pointers that is guarantreed to be smaller.
     We then free them and allocate another io buffer for the
     stack phase, but we do not free the skel buffer first,
     so 3 io buffers is a conservative estimate. */

  remaining_memory -= (double) (3*get_iobufsize());

  /* debugging */
  skel_buffer_size = (int)(remaining_memory) / (2*sizeof(int));

  /*EF_FILL=0x00;*/
  skel_buffer      = (int*)taucs_malloc(skel_buffer_size * 2*sizeof(int));
  if (!skel_buffer) return TAUCS_ERROR_NOMEM;


  skel_buffer_ptr  = 0;
  skel_outfiles    = 0;

  skel_outphase = 'e';
  skel_inphase  = 'o';

  skel_get_lastcol = -1;
  skel_next = 0;
  skel_outfile = -1;

  return TAUCS_SUCCESS;
}

static void skel_finalize()
{
  int i;

  /*
     this is a bug! somehow, we are using this memory after it is
     freed, so if we fill with ff we fail, if we fill with 0 we
     succeed. But I was not able to detect where the problem is,
     even with Electric Fence 
  */
  for (i=0; i<2*skel_buffer_size; i++) skel_buffer[i]=0xffffffff;
  for (i=0; i<2*skel_buffer_size; i++) skel_buffer[i]=0x0;

  taucs_free(skel_buffer); 
  skel_buffer = NULL;

  remaining_memory += (double) (3*get_iobufsize());
}

static int skel_add(int i,int j) 
{
  if (skel_buffer_ptr < skel_buffer_size) {
    /*
    assert(skel_buffer[2*skel_buffer_ptr  ]==0xcccccccc);
    assert(skel_buffer[2*skel_buffer_ptr+1]==0xcccccccc);
    */
    skel_buffer[2*skel_buffer_ptr]   = i;
    skel_buffer[2*skel_buffer_ptr+1] = j;
    skel_buffer_ptr++;

    return TAUCS_SUCCESS;
  } else {
    int     file;
    ssize_t io_size;
    char    fname[256];
    mode_t mode;
    mode_t perm;

    assert(0); /* for testing xxx */

    /* SORT THIS BUFFER */

    qsort(skel_buffer, skel_buffer_ptr, 2*sizeof(int), &skel_compare);

    /* WRITE OUT */
    sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,skel_outfiles);
    taucs_printf("oocsp_colanalyze: Writing out skel sort buffer <%s> (3)\n",fname);

#ifdef OSTYPE_win32
    mode = _O_WRONLY | _O_CREAT | _O_BINARY;
    perm = _S_IREAD | _S_IWRITE | _S_IEXEC;
#else
    mode = O_WRONLY | O_CREAT;
    perm = 0644;
#endif

    file = open(fname,mode,perm);
    if (file == -1) {
      taucs_printf("oocsp_colanalyze: could not create skel sort file\n");
      return TAUCS_ERROR_IO;
    }
    io_size = write(file,skel_buffer,skel_buffer_ptr * 2 * sizeof(int));
    if (io_size != skel_buffer_ptr * 2 * sizeof(int)) {
      taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
      unlink(fname);
      return TAUCS_ERROR_IO;
    }
    close(file);

    skel_outfiles++;
    skel_buffer_ptr = 0;

    taucs_printf("oocsp_colanalyze: done (using file %s)\n",fname);
    return TAUCS_SUCCESS;
  }
}

static void skel_sort_incore(int* postorder, int ncols, int* inv_postorder)
{
  int natural,i,col;

  natural = 1;
  for (i=0; i<ncols; i++) {
    if (postorder[i] != i) {
      natural = 0;
      break;
    }
  }

  if (natural == 0) {
    for (i=0; i<ncols; i++)
      inv_postorder[postorder[i]] = i;

    for (i=0; i<skel_buffer_ptr; i++) {
      col = skel_buffer[2*i+1];
      skel_buffer[ 2*i+1 ] = inv_postorder[col];
    }
  }

  qsort(skel_buffer, skel_buffer_ptr, 2*sizeof(int),
	&skel_compare);
}

static int skel_sort_outofcore(int* postorder, int ncols, int* inv_postorder)
{
  int  errorcode = TAUCS_SUCCESS;
  int  natural,i,j,k,e,f;
  int  heapsize, iobufsize, openruns, maxopenruns, runstart;
  int* infiles = NULL;
  int  outfile;

  int *inbuf  = NULL;
  int *outbuf = NULL;
  int outbuf_ptr;

  int     file;
  ssize_t io_size;
  ssize_t io_count;
  char    fname[256];

  int last_col, last_extracted;
  mode_t mode;
  mode_t perm;

  assert(0); /* debugging xxx */

  /* FIRST, WRITE OUT THIS BUFFER */
  if (skel_buffer_ptr > 0) {
    /* SORT THIS BUFFER */

    qsort(skel_buffer, skel_buffer_ptr, 2*sizeof(int), &skel_compare);

    /* WRITE OUT */
    sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,skel_outfiles);
    taucs_printf("oocsp_colanalyze: Writing out skel sort buffer <%s> (1)\n",fname);
#ifdef OSTYPE_win32
    mode = _O_WRONLY | _O_CREAT | _O_BINARY;
    perm = _S_IREAD | _S_IWRITE | _S_IEXEC;
#else
    mode = O_WRONLY | O_CREAT;
    perm = 0644;
#endif
    file = open(fname,mode,perm);
    if (file == -1) {
      taucs_printf("oocsp_colanalyze: could not create skel sort file\n");
      errorcode = TAUCS_ERROR_IO;
      goto end;
    }
    io_size = write(file,skel_buffer,skel_buffer_ptr * 2 * sizeof(int));
    if (io_size != skel_buffer_ptr * 2 * sizeof(int)) {
      taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
      unlink(fname);
      errorcode = TAUCS_ERROR_IO;
      goto end;
    }
    close(file);

    skel_outfiles++;
    skel_buffer_ptr = 0;
  }

  /* DO WE NEED TO SORT THE RUNS AGAIN IN POSTORDER? */
  natural = 1;
  for (i=0; i<ncols; i++) {
    if (postorder[i] != i) {
      natural = 0;
      break;
    }
  }

  if (natural == 0) {
    for (i=0; i<ncols; i++)
      inv_postorder[postorder[i]] = i;

    for (f=0; f<skel_outfiles; f++) {
      sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,f);
      taucs_printf("oocsp_colanalyze: Resorting skel sort file <%s>\n",fname);
#ifdef OSTYPE_win32
      mode = _O_RDWR | _O_BINARY;
#else
      mode = O_RDWR;
#endif
      file = open(fname,mode);
      if (file == -1) {
	taucs_printf("oocsp_colanalyze: could not open skel sort file\n");
	errorcode = TAUCS_ERROR_IO;
	goto end;
      }
      /* read the run */
      io_size = read(file,skel_buffer,skel_buffer_size*2*sizeof(int));
      if (io_size == -1) {
	taucs_printf("oocsp_colanalyze: read from skel sort file failed\n");
	errorcode = TAUCS_ERROR_IO;
	goto end;
      }
      io_count = io_size/(2*sizeof(int));
      close(file);

      /* sort again */

      assert((int) io_count <= skel_buffer_size);
      for (i=0; i<(int)io_count; i++) {
	j = skel_buffer[2*i+1];
	skel_buffer[ 2*i+1 ] = inv_postorder[j];
      }

      qsort(skel_buffer, io_count, 2*sizeof(int),
	    &skel_compare);

      /* rewind the file and write back */
#ifdef OSTYPE_win32
      mode = _O_RDWR | _O_BINARY;
#else
      mode = O_RDWR;
#endif
      file = open(fname,mode);
      if (file == -1) {
	taucs_printf("oocsp_colanalyze: could not open skel sort file\n");
	errorcode = TAUCS_ERROR_IO;
	goto end;
      }
      /*      lseek(file,0,SEEK_SET);*/
      io_size = write(file,skel_buffer,io_count * 2 * sizeof(int));
      if (io_size != io_count * 2 * sizeof(int)) {
	taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
	unlink(fname);
	errorcode = TAUCS_ERROR_IO;
	goto end;
      }
      close(file);
    }
  }

  /* WE NOW USE THE BUFFER AS A HEAP */
  
  heapsize = 0;

  /* in blocks of 2*sizeof(int), not bytes! */
  iobufsize   = get_iobufsize() / (2*sizeof(int)); 

  /* 
     each element in the skel_buffer is 2 ints, but 
     each element in the heap is 3 ints 
  */
  maxopenruns = (2*skel_buffer_size) / (3*iobufsize);
  if (maxopenruns < 2) {
    maxopenruns = 2;
    iobufsize   = (2*skel_buffer_size) / (3 * 2);
  }
  taucs_printf("oocsp_colanalyze: Using io buffers of %d elements (%d bytes), max runs = %d\n",
	     iobufsize,iobufsize*2,maxopenruns);

  inbuf  = (int*)taucs_malloc(iobufsize*2*sizeof(int));
  outbuf = (int*)taucs_malloc(iobufsize*2*sizeof(int));
  infiles  = (int*)taucs_malloc(skel_outfiles*sizeof(int));
  /*
  inbuf  = taucs_calloc(iobufsize,2*sizeof(int));
  outbuf = taucs_calloc(iobufsize,2*sizeof(int));
  infiles  = taucs_calloc(skel_outfiles,sizeof(int));
  */
  
  if (!inbuf || !outbuf || !infiles) {
    errorcode = TAUCS_ERROR_NOMEM;
    goto end;
  }

  while (skel_outfiles > 1) {
    char phase;
    /*    int  i,j,k,runstart,openruns;*/

    taucs_printf("oocsp_colanalyze: Starting another merge phase with %d input runs\n",
	       skel_outfiles);

    skel_infiles  = skel_outfiles;
    skel_outfiles = 0;

    phase         = skel_inphase;
    skel_inphase  = skel_outphase;
    skel_outphase = phase;

    for (runstart=0; runstart<skel_infiles; runstart += maxopenruns) {

      sprintf(fname,"%s.%c.%d",
	      skel_basename,skel_outphase,runstart/maxopenruns);
      taucs_printf("oocsp_colanalyze: Opening output run <%s>\n",fname);
#ifdef OSTYPE_win32
      mode = _O_WRONLY | _O_CREAT | _O_BINARY;
      perm = _S_IREAD | _S_IWRITE | _S_IEXEC;
#else
      mode = O_WRONLY | O_CREAT;
      perm = 0644;
#endif
      outfile = open(fname,mode,perm);
      if (outfile == -1) {
	taucs_printf("oocsp_colanalyze: could not open skel sort output file\n");
	errorcode = TAUCS_ERROR_IO;
	goto end;
      }
      skel_outfiles++;

      for (openruns=0; 
	   openruns < maxopenruns && runstart+openruns < skel_infiles;
	   openruns++) {
	
	sprintf(fname,"%s.%c.%d",skel_basename,skel_inphase,runstart+openruns);
#ifdef OSTYPE_win32
	mode = _O_RDONLY | _O_BINARY;
#else
	mode = O_RDONLY;
#endif
	infiles[openruns] = open(fname,mode);
	taucs_printf("oocsp_colanalyze: Opening input run <%s> (%d)\n",fname,infiles[openruns]);
	if (infiles[openruns] == -1) {
	  taucs_printf("oocsp_colanalyze: could not open skel sort input file\n");
	  errorcode = TAUCS_ERROR_IO;
	  goto end;
	}
	io_size = read(infiles[openruns],inbuf,iobufsize*2*sizeof(int));
	if (io_size == -1) {
	  taucs_printf("oocsp_colanalyze: read from skel sort file failed\n");
	  errorcode = TAUCS_ERROR_IO;
	  unlink(fname);
	  goto end;
	}
	io_count = io_size/(2*sizeof(int));
	if (io_count == 0) {
	  close(infiles[openruns]);
	  unlink(fname);
	}

	taucs_printf("oocsp_colanalyze: files %d %d\n",infiles[0],infiles[1]);

	taucs_printf("oocsp_colanalyze: Inserting %d elements from input run into heap\n",io_count);
	last_col = -1;
	for (e=0; e<(int)io_count; e++) {
	  i = inbuf[2*e];
	  j = inbuf[2*e+1];
	  /*printf("ij = %d\t%d\n",i,j);*/
	  if (last_col > j) {
	    taucs_printf("oocsp_colanalyze: > last = %d col = %d, (%d %d)\n",last_col,j,
		      e,io_count);
	    taucs_printf("oocsp_colanalyze: input run not sorted!\n");
	  }
	  last_col = j;
	  if (e == (int)io_count-1) /* end of inbuf marker */
	    heap_insert(skel_buffer,&heapsize,i,j,2*openruns+1);
	  else
	    heap_insert(skel_buffer,&heapsize,i,j,2*openruns);
	}
      } 

      taucs_printf("oocsp_colanalyze: files %d %d\n",infiles[0],infiles[1]);

      taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);

      outbuf_ptr = 0;

      last_extracted = -1;
      while (heapsize > 0) {
	int end_of_run,run;

	heap_extract_min(skel_buffer,&heapsize,&i,&j,&k);
	if (last_extracted > j)
	  taucs_printf("oocsp_colanalyze: heap order error!\n");
	last_extracted = j;
	outbuf[2*outbuf_ptr]   = i;
	outbuf[2*outbuf_ptr+1] = j;
	if (k % 2 == 1) end_of_run = 1;
	else end_of_run = 0;
	run = k / 2;
	outbuf_ptr ++;
	/*
	printf("extracted (%d,%d,%d) heapsize = %d\n",i,j,k,heapsize);
	*/


	if (end_of_run) {
	  taucs_printf("oocsp_colanalyze: Reading more from run %d\n",run);
	  io_size = read(infiles[run],inbuf,iobufsize * 2 * sizeof(int));
	  if (io_size == -1) {
	    taucs_printf("oocsp_colanalyze: errno = %d (%d)\n",errno,infiles[run]);
	    taucs_printf("oocsp_colanalyze: read from skel sort file failed\n");
	    errorcode = TAUCS_ERROR_IO;
	    goto end;
	  }
	  io_count = io_size/(2*sizeof(int));
	  if (io_count == 0) {
	    sprintf(fname,"%s.%c.%d",skel_basename,skel_inphase,runstart+run);
	    taucs_printf("oocsp_colanalyze: Closing input run %d <%s>\n",run,fname);
	    close(infiles[run]);
	    unlink(fname);
	  }

	  taucs_printf("oocsp_colanalyze: Extracted %d,%d,%d\n",i,j,k);
	  taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);
	  /*
	  { 
	    int ii; 
	    printf("heap: \n");
	    for (ii=0; ii<heapsize; ii++) printf("(%d,%d,%d) ",
						    skel_buffer[3*ii+0],
						    skel_buffer[3*ii+1],
						    skel_buffer[3*ii+2]);
	    printf("\n");
	  }
	  */
	  taucs_printf("oocsp_colanalyze: Inserting %d elements from input run into heap\n",io_count);

	  last_col = -1;
	  for (e=0; e<(int)io_count; e++) {
	    i = inbuf[2*e];
	    j = inbuf[2*e+1];

	    if (last_col > j) 
	      taucs_printf("oocsp_colanalyze: input run not sorted!\n");
	    last_col = j;

	    if (e == (int)io_count-1) /* end of inbuf marker */
	      heap_insert(skel_buffer,&heapsize,i,j,2*run+1);
	    else
	      heap_insert(skel_buffer,&heapsize,i,j,2*run);

	    if (3*heapsize >= 2*skel_buffer_size) {
	      taucs_printf("oocsp_colanalyze: heapsize = %d, buffer_size = %d\n",
			heapsize,2*skel_buffer_size);
	      taucs_printf("oocsp_colanalyze: merge-heap overflow\n");
	    }
	  }

	  taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);
	}

	if (outbuf_ptr >= iobufsize) {
	  taucs_printf("oocsp_colanalyze: Writing to output run\n");
	  taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);
	  io_count = iobufsize;
	  io_size = write(outfile,outbuf,io_count * 2 * sizeof(int));
	  if (io_size != io_count * 2 * sizeof(int)) {
	    taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
	    errorcode = TAUCS_ERROR_IO;
	    goto end;
	  }
	  outbuf_ptr = 0;
	}
      }

      /* write the rest of the output */
      if (outbuf_ptr > 0) {
	taucs_printf("oocsp_colanalyze: Writing to output run and closing\n");
	io_count = outbuf_ptr;
	io_size = write(outfile,outbuf,io_count * 2 * sizeof(int));
	if (io_size != io_count * 2 * sizeof(int)) {
	  taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
	  errorcode = TAUCS_ERROR_IO;
	  goto end;
	}
	outbuf_ptr = 0;
      }
      close(outfile);
      skel_buffer_ptr = 0;
    }
  }

 end:

  taucs_free(infiles);
  taucs_free(inbuf);
  taucs_free(outbuf);

  /*
  taucs_free(infiles);
  taucs_free(inbuf);
  taucs_free(outbuf);
  */

  return errorcode;
}

static int  stack_allocated = 0; /* maybe 1 would be better here, causing a fault
                                    instead of a memory leak in case of a bug */
static int  stack_buffer_size = -1;
static int* stack_buffer      = NULL;

static int skel_sort(int* postorder, int ncols,int* tmp)
{
  int  errorcode = TAUCS_SUCCESS;
  int  file;
  char fname[256];
  ssize_t io_size;
  int iobufsize;
  mode_t mode;
  mode_t perm;

  iobufsize   = get_iobufsize() / (2*sizeof(int)); 

  if (skel_outfiles == 0) {
    skel_sort_incore(postorder,ncols,tmp);

    /* xxxyyy */
    if (skel_buffer_ptr <= skel_buffer_size / 2) {
      taucs_printf("final skel ptr=%d\n",skel_buffer_ptr);
      stack_buffer      = skel_buffer + (2*skel_buffer_ptr);
      stack_buffer_size = 2*(skel_buffer_size - skel_buffer_ptr);

      stack_buffer      = taucs_malloc(stack_buffer_size * sizeof(int)); /* debugging */

      stack_allocated   = 0;
      taucs_printf("oocsp_colanalyze: Using remainder of skeleton buffer for stack,\n");
      taucs_printf("oocsp_colanalyze: size = %d ints\n",stack_buffer_size);
    } else {
      assert(0); /* for debugging xxx */
      sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,0);
      taucs_printf("oocsp_colanalyze: Writing out skel sort buffer <%s> (2)\n",fname);
#ifdef OSTYPE_win32
      mode = _O_WRONLY | _O_CREAT | _O_BINARY;
      perm = _S_IREAD | _S_IWRITE | _S_IEXEC;
#else
      mode = O_WRONLY | O_CREAT;
      perm = 0644;
#endif
      file = open(fname,mode,perm);
      if (file == -1) {
	taucs_printf("oocsp_colanalyze: could not create skel sort file\n");
	return TAUCS_ERROR_IO;
      }
      io_size = write(file,skel_buffer,skel_buffer_ptr * 2 * sizeof(int));
      if (io_size != skel_buffer_ptr * 2 * sizeof(int)) {
	taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
	unlink(fname);
	return TAUCS_ERROR_IO;
      }

      close(file);

      skel_outfiles++;
      skel_buffer_ptr = 0;

      stack_buffer      = skel_buffer;
      stack_buffer_size = 2*skel_buffer_size;
      stack_allocated   = 1; /* we need to free it */

      skel_buffer_size  = iobufsize;
      skel_buffer       = (int*)taucs_malloc(iobufsize*2*sizeof(int));
      if (!skel_buffer) return TAUCS_ERROR_NOMEM;
      taucs_printf("oocsp_colanalyze: Using skeleton buffer for stack, allocating \n");
      taucs_printf("oocsp_colanalyze: new skeleton buffer\n\n");
    }
  } else {
    assert(0); /* debugging xxx */
    if ((errorcode=skel_sort_outofcore(postorder,ncols,tmp)) != TAUCS_SUCCESS)
      return errorcode;

    stack_buffer      = skel_buffer;
    stack_buffer_size = 2*skel_buffer_size;
    skel_buffer_size  = iobufsize;
    skel_buffer       = (int*)taucs_malloc(iobufsize*2*sizeof(int));
    if (!skel_buffer) return TAUCS_ERROR_NOMEM;
    taucs_printf("oocsp_colanalyze: Using skeleton buffer for stack, allocating \n");
    taucs_printf("oocsp_colanalyze: new skeleton buffer\n");
  }

  return errorcode;
}

static int skel_get_next(int j)
{
  int row, col;
  char fname[256];
  ssize_t io_size;
  mode_t mode;

  if (skel_next >= skel_buffer_ptr) {
    if (skel_outfiles > 0) {
      if (skel_outfile == -1) {
	sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,0);
	taucs_printf("oocsp_colanalyze: Opening skel sort buffer <%s> (2)\n",fname);
#ifdef OSTYPE_win32
	mode = _O_RDONLY | _O_BINARY;
#else
	mode = O_RDONLY;
#endif
	skel_outfile = open(fname,mode);
	if (skel_outfile == -1) {
	  taucs_printf("oocsp_colanalyze: could not open skel sort file\n");
	  return TAUCS_ERROR_IO;
	}
      }
      io_size = read(skel_outfile,
		     skel_buffer,skel_buffer_size * 2 * sizeof(int));
      if (io_size == -1) {
	taucs_printf("oocsp_colanalyze: I/O error while trying to read skel sort file\n");
	return TAUCS_ERROR_IO;
      }
      if (io_size == 0) { /* end of file */
	taucs_printf("oocsp_colanalyze: Closing and removing skel file, col=%d\n",j);
	sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,0);
	close(skel_outfile);
	unlink(fname);
	skel_outfiles   = 0;
	skel_next       = 0;
	skel_buffer_ptr = 0;
	return -1;
      } else {
	skel_next       = 0;
	skel_buffer_ptr = io_size / (2*sizeof(int));
	taucs_printf("oocsp_colanalyze: read %d elements from skel sort file\n",skel_buffer_ptr);
      }
    } else
      return -1;
  }

  if (skel_next >= skel_buffer_ptr)
    return -1;

  assert(skel_next >= 0);
  assert(skel_next < skel_buffer_size);

  row = skel_buffer[2*skel_next];
  col = skel_buffer[2*skel_next+1];

  if (col == j) {
    skel_next++;
    return row;
  }
  else
    return -1;
}
    

static int skel_get_postordercol(int* found, int flag,
				  int j,
				  int* nnz, int* rowind)
{
  int row;

  *nnz = 0;
  while ((row = skel_get_next(j)) >= 0) {
    if (found[row] < flag) {
      found[row] = flag;
      rowind[ *nnz ] = row;
      (*nnz)++;
    }
  }
  /* if skel_get_next TAUCS_ERROR==-1 it only means EOF; 
     otherwise, there is an error */
  if (row != TAUCS_ERROR) return row;
  else                    return TAUCS_SUCCESS;
}


/*
static void skel_get_postordercol(int* found, int flag,
				  int j,
				  int* nnz, int* rowind)
{
  int row;
  int i;
  int next;

  next = skel_get_next;
  while (skel_buffer[2*next + 1] < j && next < skel_buffer_ptr) {
    next++;
  }

  if (skel_buffer[2*next + 1] == j && next < skel_buffer_ptr) {
    if (next != skel_get_lastcol+1)
      taucs_printf("oocsp_colanalyze: internal error in get_postordercol\n");
  }

  *nnz = 0;
  for (i = next;
       skel_buffer[2*i + 1] == j && i < skel_buffer_ptr;
       i++) {
    row = skel_buffer[2*i];

    if (found[row] < flag) {
      found[row] = flag;
      rowind[ *nnz ] = row;
      (*nnz)++;
    }
  }

  skel_get_next = i;
  skel_get_lastcol = i - 1;
}
*/

/* UNION FIND ROUTINES */

static int uf_makeset(int* uf, int i)        { uf[i] = i; return i; }
static int uf_union  (int* uf, int s, int t) { uf[s] = t; return t; }
static int uf_find   (int* uf, int i)        { if (uf[i] != i) 
                                                 uf[i] = uf_find(uf,uf[i]); 
                                               return uf[i]; }

/* FILL STACK ROUTINES */

static int  stack_files = -1;
static char stack_basename[256];

/*
static int  stack_buffer_size;
static int* stack_buffer;
*/
static int  stack_buffer_ptr = -1;

static int  stack_top = -1;

static double stack_size = -1;
static double stack_max_size = -1;

static void stack_init(char* basename,
		       int* colptr, int* colstack, 
		       int ncols)
{
  int j;

  sprintf(stack_basename,"%s.fstack",basename);
  stack_files      = 0;
  stack_buffer_ptr = 0;
  stack_top        = -1;

  for (j=0; j<ncols; j++) colptr[j] = -1;

  /*
  stack_buffer_size = 1048576;
  stack_buffer = mxCalloc(stack_buffer_size,sizeof(int));
  */

  stack_size     = 0.0;
  stack_max_size = 0.0;
}

static void stack_finalize()
{
  if (stack_files != 0 || stack_buffer_ptr != 0)
    taucs_printf("oocsp_colanalyze: fill stack did not get empty\n");

  if (stack_allocated)
    taucs_free(stack_buffer);

  taucs_printf("oocsp_colanalyze: max stack size = %.0lf\n",stack_max_size);
}

static int stack_push(int* colptr, int* colstack, int i, int j)
{
  mode_t mode;
  mode_t perm;

  if (stack_top < 0 || colstack[stack_top] != j) {
    if (colptr[j] != -1) {
      taucs_printf("oocsp_colanalyze: fill stack internal error (push)\n");
    }
    stack_top++;
    colstack[stack_top] = j;
    colptr[j] = (stack_buffer_size*stack_files) + stack_buffer_ptr;
    if (colptr[j] > (INT_MAX/2))
      taucs_printf("oocsp_colanalyze: Warning! Pointers to fill stack may overflow\n");
  }
   
  stack_buffer[stack_buffer_ptr] = i;
  stack_buffer_ptr++;

  if (stack_buffer_ptr >= stack_buffer_size) {
    int     file;
    ssize_t io_size;
    char    fname[256];

    assert(0); /* debugging xxx */

    sprintf(fname,"%s.%d",stack_basename,stack_files);
    taucs_printf("oocsp_colanalyze: Writing out fill stack buffer <%s>\n",fname);
#ifdef OSTYPE_win32
    mode = _O_WRONLY | _O_CREAT | _O_BINARY;
    perm = _S_IREAD | _S_IWRITE | _S_IEXEC;
#else
    mode = O_WRONLY | O_CREAT;
    perm = 0644;
#endif
    file = open(fname,mode,perm);
    if (file == -1) {
      taucs_printf("oocsp_colanalyze: could not create stack file\n");
      return TAUCS_ERROR_IO;
    }
    io_size = write(file,stack_buffer,stack_buffer_size * sizeof(int));
    if (io_size != stack_buffer_size * sizeof(int)) {
      taucs_printf("oocsp_colanalyze: write to stack file failed\n");
      unlink(fname);
      return TAUCS_ERROR_IO;
    }
    close(file);

    stack_files++;
    stack_buffer_ptr = 0;
  }

  /*
  stack_size++;
  if (stack_size > stack_max_size) stack_max_size = stack_size;
  */
  return TAUCS_SUCCESS;
}

static int stack_pop(int* colptr, int* colstack, 
		      int* found, int flag,
		      int j, int* nnz, int* rowind)
{
  int row;
  int   i;
  mode_t mode;

  if (stack_top < 0 || colstack[stack_top] != j) { /* empty fill column */
    for (i=0; i<=stack_top; i++)
      if (colstack[i] == j) {
	taucs_printf("oocsp_colanalyze: fill stack internal error (pop)\n");
	return TAUCS_ERROR;
      }
    
    *nnz    = 0;
  } else {
    *nnz = 0;
    while (colptr[j] < (stack_buffer_size*stack_files) +stack_buffer_ptr) {

      if (stack_buffer_ptr == 0) {
	int     file;
	ssize_t io_size;
	char    fname[256];
	
	assert(0); /* debugging xxx */

	stack_files--;
	stack_buffer_ptr = stack_buffer_size;
	sprintf(fname,"%s.%d",stack_basename,stack_files);
	taucs_printf("oocsp_colanalyze: Reading a fill stack buffer <%s>\n",fname);
#ifdef OSTYPE_win32
	mode = _O_RDONLY | _O_BINARY;
#else
	mode = O_RDONLY;
#endif
	file = open(fname,mode);
	if (file == -1) {
	  taucs_printf("oocsp_colanalyze: could not open stack file\n");
	  return TAUCS_ERROR_IO;
	}
	io_size = read(file,stack_buffer,stack_buffer_size * sizeof(int));
	if (io_size != stack_buffer_size * sizeof(int)) {
	  taucs_printf("oocsp_colanalyze: read from stack file failed\n");
	  unlink(fname);
	  return TAUCS_ERROR_IO;
	}
	close(file);
	unlink(fname);
      }

      stack_buffer_ptr --;
      assert(stack_buffer_ptr >= 0);
      row = stack_buffer[ stack_buffer_ptr ];

      /*printf(">>> %08x \n",row);*/

      if (found[row] < flag) {
	found[row] = flag;
	rowind[ *nnz ] = row;
	(*nnz)++;
      }
    }

    stack_top--;
    colptr[j] = -1;
  }
  /*
  stack_size -= (double) (*nnz);
  */

  return TAUCS_SUCCESS;
}

/* MAIN ROUTINE */

static
int oocsp_colanalyze(taucs_ccs_matrix* matrix,
		      char* basename,
		      int*  colperm,
		      int** ptrparent,
		      int** ptrpostorder,
		      int** ptrlcolcount,
		      int** ptrucolcount)
{
  int errorcode = TAUCS_SUCCESS;

  int i,j,ip,p,jp;
  int nnz,cset,rset,rroot,fcol;
  
  int  postnum, depth;
  int* first_kid;
  int* next_kid;
  int* stack_vertex;
  int* stack_child;

  /*int* colptr;*/
  int* rowind;
  int* firstcol  = NULL;
  int* uf        = NULL;
  int* found;
  int* root      = NULL;
  int* stack_colptr;
  int* stack_colstk;
  int* tmp_col   = NULL;
  
  int* parent    = NULL;
  int* postorder = NULL;
  int* lcolcount = NULL;
  int* ucolcount = NULL;  
  
  int*    nrows;
  int*    ncols;

  taucs_printf("oocsp_colanalyze: In colanalyze\n");
  taucs_printf("oocsp_colanalyze: using %.0lf MBytes of memory\n",(remaining_memory)/1048576.0);
  /*EF_FILL=0x00;*/
  
  nrows     = &matrix->m;
  ncols     = &matrix->n;

  /* START THE ANALYSYS */

  if ((errorcode = skel_init(basename)) != TAUCS_SUCCESS)
    goto end;

  (remaining_memory) -= (double) ( 4 * (*ncols) * 4 /* sizeof(int32) */);

  /* +1 for stack_vertex, stack_child */
  parent          = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrparent = parent;
  lcolcount       = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrlcolcount = lcolcount;
  ucolcount       = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrucolcount = ucolcount;
  postorder       = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrpostorder = postorder;


  (remaining_memory) -= (double) ( 2 * ((*ncols)+1) * sizeof(int));
  (remaining_memory) -= (double) ( 2 * (*nrows) * sizeof(int));

  uf        = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  root      = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  firstcol  = (int*)taucs_malloc((*nrows)*sizeof(int));

  tmp_col   = (int*)taucs_malloc((*nrows)*sizeof(int));

  if (!parent || !lcolcount || !ucolcount || !postorder || 
      !uf || !root || !firstcol || !tmp_col) {
    errorcode = TAUCS_ERROR_NOMEM;
    goto end;
  }

  /* we can reuse the same space */
  first_kid = uf;
  next_kid  = root;

  found        = firstcol;
  stack_colptr = uf;
  stack_colstk = root;

  /* we use the output arrays before they are used. */
  stack_vertex = lcolcount; 
  stack_child  = ucolcount;

  for (i=0; i < (*nrows); i++) {
    firstcol[i] = (*ncols);
  }

  for (j=0; j < (*ncols); j++) {

#ifdef OLD
    {
      int   file;
      off_t offset;
      ssize_t io_size;
	
      nnz = clen[j];
      /* taucs_printf("oocsp_colanalyze: Debug_1: j= %d nnz= %d\n",j,nnz);*/
      file   = dfile_fid[ ind_fid[j] ];
      offset = ind_off[j];
      if (nnz != 0 && file != -1 && offset != -1) {
	if (lseek(file,offset,SEEK_SET) == -1) taucs_printf("oocsp_colanalyze: lseek failed\n");
	io_size = read(file, tmp_col, nnz * sizeof(int));
	if (io_size != nnz * sizeof(int)) taucs_printf("oocsp_colanalyze: Error reading data.\n");
      }
      rowind = tmp_col;

    }
#else
    /*
    nnz = clen[j];
    oocsp_readcol_structure(matrix,j,tmp_col);
    rowind = tmp_col;
    */

    /* new code: Sivan 28 Feb 2002 */
    nnz = (matrix->colptr)[colperm[j]+1] - (matrix->colptr)[colperm[j]];
    rowind = (matrix->rowind) + (matrix->colptr)[colperm[j]];
#endif

    cset       = uf_makeset(uf, j);
    root[cset] = j;
    parent[j]  = (*ncols);
    
    for (ip=0; ip<nnz; ip++) {
      
      i = rowind[ip];
      fcol = firstcol[i];
      if (fcol >= j) {
	firstcol[i] = j;
	fcol        = j;
      } else {
	rset = uf_find(uf,fcol);
	rroot = root[rset]; 
	if (rroot != j) {
	  parent[rroot] = j;
	  cset          = uf_union(uf,cset,rset);
	  root[cset]    = j;
	}
      }
      /* ADD (j,fcol) TO SKELETON */
      skel_add(j,fcol);
    }
    /*
    mxDestroyArray(output_args[0]);
    */
  }

  /* COMPUTE POSTORDER OF ETREE */

  /* create linked lists of children */
  for (j=0; j <= *ncols; j++) first_kid[j] = -1;
  for (j = (*ncols)-1; j >= 0; j--) {
    p              = parent[j];
    next_kid[j]    = first_kid[p];
    first_kid[p] = j;
  }

  
  /* do dfs in a loop */
  postnum = 0;
  depth = 0;
  stack_vertex[depth] = *ncols; /* root */
  stack_child [depth] = first_kid[ stack_vertex[depth] ];
  while (depth >= 0) {
    if ( stack_child[depth] != -1 ) {
      stack_vertex[depth+1] = stack_child[depth];
      stack_child [depth+1] = first_kid[  stack_vertex[depth+1] ];
      depth++;
    } else {
      if ( stack_vertex[depth] != (*ncols) ) { /* not root */
	if (stack_vertex[depth] >= *ncols) 
	  taucs_printf("oocsp_colanalyze: internal error in dfs (0)\n");
	postorder[ postnum ] = stack_vertex[depth];
	postnum++;
      }
      depth--;
      if (depth >= 0) /* sivan June 30, to avoid crash, seems to be right */
	stack_child[depth] = next_kid[  stack_child[depth] ];
    }
    if (depth > *ncols) {
      int i;
      taucs_printf("oocsp_colanalyze: depth=%d, ncols=%d\n",depth,*ncols);
      for (i=0; i<(*ncols); i++) {
	taucs_printf("oocsp_colanalyze: %d: [%d %d]\n",i,first_kid[i],next_kid[i]);
      }
      taucs_printf("oocsp_colanalyze: internal error in dfs (1)\n");
    }
  }

  
  if (postnum != *ncols) taucs_printf("oocsp_colanalyze: internal error in dfs (2)\n");
    
  /* SORT THE SKELETON MATRIX */

  if ((errorcode = skel_sort(postorder, *ncols, found /* temporary */)) != TAUCS_SUCCESS)
    goto end;
  
  /* SECOND PHASE, COMPUTE COLCOUNTS */

  /* we reuse the space of uf and root */
  stack_init(basename,stack_colptr,stack_colstk,*ncols); 

  for (i=0; i < (*nrows); i++) {
    found[i] = -1;
  }

  for (j=0; j<(*ncols); j++) {
    lcolcount[j] = 1;
    ucolcount[j] = 1;
  }

  for (jp=0; jp<(*ncols); jp++) {
    j = postorder[jp];
    found[j] = jp;
    p = parent[j];

    if (p < (*ncols)) {
      found[p] = jp;
      lcolcount[j]++;
      ucolcount[p]++;
    }

    if ((errorcode=stack_pop(stack_colptr,stack_colstk,
			     found, jp,
			     j,
			     &nnz,tmp_col)) != TAUCS_SUCCESS) goto end;
    rowind = tmp_col;
    for (ip=0; ip<nnz; ip++) {
      i = rowind[ip];

      lcolcount[j]++;
      ucolcount[i]++;
      if (p < *ncols) 
	if ((errorcode=stack_push(stack_colptr,stack_colstk,i,p)) != TAUCS_SUCCESS) goto end;
    }

    if ((errorcode=skel_get_postordercol(found,jp,
					 jp, /* use postorder column index */
					 &nnz,tmp_col)) != TAUCS_SUCCESS) goto end;
    rowind = tmp_col;
    for (ip=0; ip<nnz; ip++) {
      i = rowind[ip];

      lcolcount[j]++;
      ucolcount[i]++;

      if (lcolcount[j] > *(ncols))
	taucs_printf("oocsp_colanalyze: Internal error while producing lcolcounts\n");
      if (ucolcount[i] > *(ncols))
	taucs_printf("oocsp_colanalyze: Internal error while producing ucolcounts\n");

      if (p < *ncols) 
	if ((errorcode=stack_push(stack_colptr,stack_colstk,i,p)) != TAUCS_SUCCESS) goto end;
    }
  }

  /* MAKE AND POSTORDER PARENT 1-BASED AND MARK ROOTS WITH A ZERO */

  /*for (j=0; j < (*ncols); j++) {
    postorder[j]++;
    if (parent[j] == (*ncols))
      parent[j] = 0;
    else 
      parent[j]++;
      }  */
  /* fix up by Vladi */

  for (j=0; j < (*ncols); j++) {
    if (parent[j] == (*ncols))
      parent[j] = -1;
    /*lcolcount[j]--;
      ucolcount[i]--;*/
  }

  stack_finalize();

  /*
  mxDestroyArray(tmp1_array);
  mxDestroyArray(tmp2_array);
  mxDestroyArray(tmp3_array);
  mxDestroyArray(tmp4_array);
  */

end:

  if (errorcode != TAUCS_SUCCESS) {
    /* we must release arrays that were supposed to be returned */
    taucs_free(parent);
    taucs_free(lcolcount);
    taucs_free(ucolcount);
    taucs_free(postorder);
    *ptrparent = *ptrlcolcount = *ptrucolcount = *ptrpostorder = NULL;
  }

  taucs_free(uf);
  taucs_free(firstcol);
  taucs_free(root);
  taucs_free(tmp_col);

  (remaining_memory) += (double) ( 2 * ((*ncols)+1) * sizeof(int));
  (remaining_memory) += (double) ( 2 * (*nrows) * sizeof(int));

  skel_finalize();

  taucs_printf("oocsp_colanalyze: done\n");
  
  return errorcode;
}

/*********************************************************/
/* PANELIZATION                                          */
/*********************************************************/


/* There seems to be a confusion here between spawidth and remaining memory; sivan */

static
int oocsp_panelize_simple(
			   int  nrows,             /* input  */  
			   int  ncols,             /* input  */
			   int* postorder,         /* input  */
			   int* l_colcounts,       /* input  */
			   int* u_colcounts,       /* input  */
			   int* parents,           /* input  */

                           int* spawidth,          /* input  */  
                           int* maxsn,             /* input  */  
			   int** ptrpanels,        /* output */
			   int** ptrschedstart,    /* output */
			   int** ptrschedend,      /* output */
			   int** ptrfetchnext,     /* output */
			   int** ptrejectnext      /* output */
			   )

{
  int i,j,c;
  int panelsize, panelcols, panelnumber;
  int stop,eject,newpanel;
  int colcount;
  int* panels;
  int* schedstart;
  int* schedend;
  int* fetchnext;
  int* ejectnext;  
  double maxpanelsize;
  double memuse, width_multiplier;
  int  maxcolcount;

  maxcolcount = 0;
  for (j=0; j<ncols; j++)
    maxcolcount = max( maxcolcount, l_colcounts[j]+u_colcounts[j] );

  taucs_printf("oocsp_panelize: max col count = %d, nrows = %d\n",
	     maxcolcount, nrows);

  /*
    memory usage in numfact, width > 0, snodes:
           8*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           2*ncols*I + 
           1*w*nrows*I + 1*w*nrows*D + 1*w*I +
           5*w*maxc *I + 2*w*maxc *D +
           1*s*I +
           1*s*maxc *I + 2*s*maxc *D 

    we also account for the memory required for the in-core representation of A, L, U,
    which is 2*ncols*sizeof(char)+3*ncols*sizeof(int)
  */

  *maxsn = 8 - 4; /* 8 is the minimum value we use here */
  do {
    *maxsn += 4;
    memuse = 
      8.0*nrows*sizeof(int) + 2.0*nrows*sizeof(taucs_datatype) 
      + 2.0*nrows*sizeof(char) + 3.0*nrows*sizeof(void*)
      + 2.0*ncols*sizeof(int)
      + 1.0*(*maxsn)*sizeof(int)
      + 1.0*(*maxsn)*maxcolcount*sizeof(int) + 2.0*(*maxsn)*maxcolcount*sizeof(taucs_datatype);

    memuse +=
      3 * ( 3.0*ncols*sizeof(int) + 2.0*ncols*sizeof(char) );

    width_multiplier = 
      1.0*nrows*sizeof(int) + 1.0*nrows*sizeof(taucs_datatype) + 1.0*sizeof(int)
      + 5.0*maxcolcount*sizeof(int) + 2.0*maxcolcount*sizeof(taucs_datatype);
    *spawidth = (int) floor( (remaining_memory - memuse) / width_multiplier );
  } while (*spawidth > 4*(*maxsn));

  if (*spawidth < 8) *spawidth = 8; /* it might go over the limit */

  maxpanelsize = nrows * *spawidth; 

  taucs_printf("oocsp_panelize: spawidth = %d, max supernode = %d\n",*spawidth,*maxsn);

  panels         = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrpanels)     = panels;
  schedstart     = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrschedstart) = schedstart;
  schedend       = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrschedend)   = schedend;
  fetchnext      = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrfetchnext)  = fetchnext;
  ejectnext      = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrejectnext)  = ejectnext;
  if (!panels || !schedstart || !schedend || !fetchnext || !ejectnext) {
    taucs_free(panels    );
    taucs_free(schedstart);
    taucs_free(schedend  );
    taucs_free(fetchnext );
    taucs_free(ejectnext );

    (*ptrpanels)     = NULL;
    (*ptrschedstart) = NULL; 
    (*ptrschedend)   = NULL; 
    (*ptrfetchnext)  = NULL; 
    (*ptrejectnext)  = NULL; 
    
    return TAUCS_ERROR_NOMEM;
  }
  
  panelnumber = 1;
  j           = 0;
  eject = 0;
  
  while (j<ncols)
  {
    panelsize = 0;
    panelcols = 0;
    stop = 0;

    while (stop == 0) 
      {
	c = postorder[j];
	colcount = l_colcounts[c] + u_colcounts[c]; /* was only L, don't remember why, sivan */
	newpanel = 0;
	if (*spawidth > 0)
	  {
	  if (panelcols >= *spawidth) 
	    newpanel=1; 
	  else
	    if (panelsize + colcount > maxpanelsize) newpanel=1; 
	  }

	if (newpanel == 0)
	  {
	    /* add c to panel */
	    panelsize = panelsize + colcount;
	    panelcols = panelcols + 1;
	    panels[c] = panelnumber;
	    ejectnext[j] = c;
	    schedstart[c] = panelnumber;
	    schedend[c]   = panelnumber;
	    j = j+1;
	    if (j >= ncols) stop = 1; 
	  } 
	else
	  {
	    panelnumber = panelnumber + 1;
	    stop = 1;
	  }
      }
    
    /* now reverse panel, compute fetchnext */
#ifdef VLADIMIR
    if(j<ncols)
      for (i=0;i<j-eject;i++)
	fetchnext[eject+i] = ejectnext[j-i];
    else {
      printf("j >= ncols???\n");
      assert(0);
      for (i=0;i<j-eject;i++)
	fetchnext[eject+i] = ejectnext[j-i-1];
    }
#else
    for (i=0;i<j-eject;i++)
      fetchnext[eject+i] = ejectnext[j-i-1];
#endif

    eject = j;
     
  }

  return TAUCS_SUCCESS;
}


/*********************************************************/
/* NUMERICAL PHASE                                       */
/*********************************************************/


#define SNODES 
#define SNODE_THRESHOLD 4
#define SNODE_BLOCK 8
#define SIMPLE_COL_COL_no
#define SPA_ONEARRAY
#define USE_BLAS
#define BLAS_THRESHOLD 10
#define BLOCK 16

#ifdef DETAILED_TIMING
static double flops_extra = 0.0;
static double flops_dense = 0.0;
#endif

/*
  Out-of-core sparse LU

  Numerical factorization.

  Memory management
    ipivots              nrows*I                            ! not freed, returned to caller
    rowlists             nrows*I + 3*width*maxlucols*I      ! heads,colind,next,prev
    heap                 nrows*I                            !                        

    spa                  nrows*I + nrows*D                  ! if spawidth < 0
    spa (width>0)        width*nrows*I + width*nrows*D      !

    lindices             nrows*C                            ! L or U 
    nnzmap               nrows*C                            ! ?

    panel                2*ncols*I                          ! id, nnz
                         + 3*ncols*P                        ! ind, inrowlist, re

    panel_compressed     sum_j lucols_j*(D+2*I), j=1:width  ! re, ind, inrowlist
                         <= width*maxlucols*(D+2*I)
                         
    snodes               2*ncols*I                          ! pivots, snode_index
                         +   MAX_SNODE*I                    ! pivrows
			 + 2*nrows*I                        ! ind, map
			 +   nrows*D                        ! re ???
			 +   MAX_SNODE*maxcolcount*I        ! lu_ind
			 +   MAX_SNODE*maxcolcount*D        ! lu_re

    snodes_dense         SNODE_MAX*maxcolcont*D             ! S
                         + width*maxcolcount*D              ! P
                         + width*I                          ! updcols

    no_snodes            nrows*I + nrows*D                  ! lu_ind, lu_re

    total: width > 0, snodes
           8*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           2*ncols*I + 
           1*w*nrows*I + 1*w*nrows*D + 1*w*I +
           5*w*maxc *I + 2*w*maxc *D +
           1*s*I +
           1*s*maxc *I + 2*s*maxc *D 

    total: width < 0, snodes
           8*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           2*ncols*I + 
           1*w*I +
           5*w*maxc *I + 2*w*maxc *D +
           1*s*I +
           1*s*maxc *I + 2*s*maxc *D 

    total: width > 0, no snodes
           7*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           1*w*nrows*I + 1*w*nrows*D +
           5*w*maxc *I + 1*w*maxc *D +

    total: width < 0, no snodes
           7*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           5*w*maxc *I + 1*w*maxc *D +

    where:
      I=sizeof(int) D=sizeof(taucs_datatype) C=sizeof(char) P=sizeof(void*)
      maxc=maxcolcount w=spawidth (=active panelwidth) s=max_snode
*/



static double time_total;

#ifdef DETAILED_TIMING
static double time_colcol;
static double time_colcol_1;
static double time_colcol_2;
static double time_factor;
static double time_scatter;
static double time_gather;
static double time_append;
static double time_read;
#ifdef SNODES
static double time_snode_tmp;
static double time_snode_1 = 0;
static double time_snode_2 = 0;
static double time_snode_21 = 0;
static double time_snode_3 = 0;
static double time_snode_4 = 0;
static double time_snode_detect;
static double time_snode_prepare;
static double time_snode_dense;
#endif

static double bytes_read;
static double bytes_appended;
static double col_ooc_updates;
static double col_read;

static double  flops;
static double  scatters;
static double  gathers;
static double  rowlist_ops;
static double  num_heap_ops;
#endif /* DETAILED_TIMING */

/****************************************************/
/*                                                  */
/* Heap operations                                  */
/*                                                  */
/****************************************************/

/* HEAP OPERATIONS */


static void num_heap_heapify(int* heap, int* heapsize, 
			     int* ipivots, int p) 
{
  int r,l,smallest;
  int temp;

#ifdef DETAILED_TIMING
  num_heap_ops += 1.0;
#endif

  r = (p+1) * 2;
  l = r - 1;

#ifdef INTERNAL_CHECKS
  if ((l-1)/2 != p || (r-1)/2 != p)
    taucs_printf("oocsp_numfact: left-right computation in heap failed\n");
#endif

  if (l < *heapsize && ipivots[heap[l]] < ipivots[heap[p]])
    smallest = l;
  else
    smallest = p;

  if (r < *heapsize && ipivots[heap[r]] < ipivots[heap[smallest]])
    smallest = r;
  
  if (smallest != p) {
    temp           = heap[p]; 
    heap[p]        = heap[smallest];
    heap[smallest] = temp;
    
    num_heap_heapify(heap, heapsize, ipivots, smallest);
  }
}

static void num_heap_insert(int* heap, int* heapsize, int* ipivots, int i)
{
  int child, parent;

  (*heapsize)++;

#ifdef DETAILED_TIMING
  num_heap_ops += 1.0;
#endif

  child = (*heapsize-1);
  parent = (child-1) / 2;
  while (child > 0 && (ipivots[heap[parent]] > ipivots[i])) {
    heap[child]   = heap[parent];
    child = parent;
    parent = (child-1) / 2;

#ifdef DETAILED_TIMING
    num_heap_ops += 1.0;
#endif
  }

  heap[child]   = i;
}

static int num_heap_extractmin(int* heap, int* heapsize, int* ipivots) 
{
  int m; 

#ifdef DETAILED_TIMING
  num_heap_ops += 1.0;
#endif

  if (*heapsize <= 0) return -1;
  
  m = heap[0];
  
  heap[0] = heap[(*heapsize)-1];

  (*heapsize)--;

  num_heap_heapify(heap,heapsize,ipivots,0);

  return m;
}

/****************************************************/
/*                                                  */
/* Row lists                                        */
/*                                                  */
/****************************************************/

static int* rowlists_head     = NULL;   /* one head per row */
static int* rowlists_colind   = NULL; 
static int* rowlists_next     = NULL;
static int* rowlists_prev     = NULL;
static int  rowlists_size     = -1;
static int  rowlists_freehead = -1;
/*static int  rowlists_freenext;*/

static void rowlists_finalize()
{
  taucs_free(rowlists_head);
  taucs_free(rowlists_colind);
  taucs_free(rowlists_next);
  taucs_free(rowlists_prev);

  rowlists_head   = NULL;
  rowlists_colind = NULL;
  rowlists_next   = NULL;
  rowlists_prev   = NULL;
}

static int rowlists_init(int size, int nrows)
{
  int i;

  rowlists_size = size;

  rowlists_head   = (int*)taucs_malloc(nrows*sizeof(int));
  rowlists_colind = (int*)taucs_malloc(rowlists_size*sizeof(int));
  rowlists_next   = (int*)taucs_malloc(rowlists_size*sizeof(int));
  rowlists_prev   = (int*)taucs_malloc(rowlists_size*sizeof(int));
  if (!rowlists_head || !rowlists_colind || !rowlists_next || !rowlists_prev) 
    return TAUCS_ERROR_NOMEM; 

  for (i=0; i<nrows; i++) rowlists_head[i] = -1;

  /* link the entire rowlist as one freelist */

  rowlists_freehead = 0;
  for (i=0; i<rowlists_size; i++) {
    rowlists_next[i] = i+1;
    /* freelist does not need prev pointers */ 
    /* rowlists_prev[i] = i-1; */ 
  }
  rowlists_next[ rowlists_size - 1 ] = -1;

  return TAUCS_SUCCESS; 
}

static int rowlists_insert(int row, int panelcol)
{
  int new;

#ifdef DETAILED_TIMING
  rowlist_ops += 1.0;
#endif /* DETAILED_TIMING */

  /* get memory from the freelist */

  if ((new = rowlists_freehead) == -1) {
    taucs_printf("oocsp_numfact: Out of rowlist memory\n");
    exit(1);
  }

  /* remove this memory from the freelist; freelist does now use prev */

  rowlists_freehead = rowlists_next[ new ];

  /* link to row list */

  rowlists_next[ new ] = rowlists_head[ row ];
  rowlists_prev[ new ] = -1;
  rowlists_colind[ new ] = panelcol;

  if (rowlists_next[new] != -1)
    rowlists_prev[ rowlists_next[new] ] = new;

  rowlists_head[ row ] = new;

  return new;
}

static void rowlists_delete(int row, int index)
{
#ifdef DETAILED_TIMING
  rowlist_ops += 1.0;
#endif /* DETAILED_TIMING */

  if (rowlists_head[ row ] == index)
    rowlists_head[ row ] = rowlists_next[ index ];

  if (rowlists_next[ index ] != -1)
    rowlists_prev[ rowlists_next[index] ] = rowlists_prev[ index ];

  if (rowlists_prev[ index ] != -1)
    rowlists_next[ rowlists_prev[index] ] = rowlists_next[ index ];

  rowlists_next[ index ] = rowlists_freehead;
  rowlists_freehead = index;
}

static int rowlists_isempty()
{
  int i,count;

  i = rowlists_freehead;
  count = 0;
  while (i != -1) {
    count++;
    i = rowlists_next[i];
  }

  if (count == rowlists_size) return 1;
  else return 0;
}
  
/****************************************************/
/*                                                  */
/* Spa routines                                     */
/*                                                  */
/****************************************************/

#if 0
static
int intcmp(const void* v1, const void* v2)
{
  const int* i1 = (const int*) v1;
  const int* i2 = (const int*) v2;

  if      (*i1 < *i2) { return -1; }
  else if (*i1 > *i2) { return  1; }
  else                  return  0;
}
#endif

static taucs_datatype*  spa    = NULL;
static char*            spamap = NULL;

static void spa_finalize()
{
  taucs_free(spa);
  taucs_free(spamap);

  spa    = NULL;
  spamap = NULL;
}

static int spa_init(int nrows)
{
  int i;

  spa    = (taucs_datatype*) taucs_malloc(nrows*sizeof(taucs_datatype));
  spamap = (char*)  taucs_malloc(nrows*sizeof(char));

  if (!spa || !spamap) return TAUCS_ERROR_NOMEM;

  for (i=0; i<nrows; i++) {spa[i] = taucs_zero; spamap[i] = 0;}
  
  return TAUCS_SUCCESS;
}

static void
gather(int             a_nnz,
       taucs_datatype* a_re,
       int*            a_ind,
       taucs_datatype* spa,
       char*           spamap)

{
  int i,ip;

#ifdef DETAILED_TIMING
  double time_tmp;

  gathers += ((double) a_nnz);
  time_tmp = taucs_wtime();
#endif

  for (ip=0; ip<a_nnz; ip++) {
    i = a_ind[ip];
    a_re[ip] = spa[i];
    spamap[i] = 0;

    spa[i] = taucs_zero;
  }

#ifdef DETAILED_TIMING
  time_gather += (taucs_wtime() - time_tmp);
#endif
}

static void 
scatter(int             a_nnz,
	taucs_datatype* a_re,
	int*            a_ind,
	taucs_datatype* spa,
	char*           spamap)
     
{
  int i,ip;

#ifdef DETAILED_TIMING
  double time_tmp;

  scatters += ((double) a_nnz);
  time_tmp = taucs_wtime();
#endif

  for (ip=0; ip<a_nnz; ip++) {
    i = a_ind[ip];
    spa[i]    = a_re[ip];
    spamap[i] = 1;
  }

#ifdef DETAILED_TIMING
  time_scatter += (taucs_wtime() - time_tmp);
#endif
}

/****************************************************/
/*                                                  */
/* Column updates                                   */
/*                                                  */
/****************************************************/

static
void spcol_spa_update(int pivotindex,
		      taucs_datatype* l_re,
		      int*    l_ind,
		      int     l_nnz,
		      int*    panelcols,
 		      int     panelcols_n,
		      int     nrows,
		      int**   a_inrowlist,
		      taucs_datatype* spa,
		      char*   spamap,
		      int**   a_ind,
		      int*    a_nnz)

{
  int i,ip,j,q;
  taucs_datatype v;

  for(j = 0; j < panelcols_n; j++) {
    q = panelcols[j];
    if (   taucs_re(spa[q*nrows + pivotindex]) == 0.0 
	&& taucs_im(spa[q*nrows + pivotindex]) == 0.0) continue;
    for (ip=0; ip<l_nnz; ip++) {
      i = l_ind[ip];
      v = l_re[ip];
      if (spamap[q*nrows + i] == 0) {
	spamap[q*nrows + i] = 1;
	spa   [q*nrows + i] = taucs_zero;
	a_ind      [q][ a_nnz[q] ] = i;
	a_inrowlist[q][ a_nnz[q] ] = rowlists_insert(i,q);
	(a_nnz[q])++;
      }
      /*spa[q*nrows + i] -= (spa[q*nrows + pivotindex] * v);*/
      spa[q*nrows + i] = taucs_sub(spa[q*nrows + i],
				   taucs_mul(spa[q*nrows + pivotindex] , v));
    }
  }
  return;
}

#ifdef SIMPLE_COL_COL
static void
spcol_spcol_update(int pivotindex,
		   taucs_datatype* l_re,
		   int*    l_ind,
		   int     l_nnz,
		   int     panelcol,
		   int*    a_inrowlist,
		   taucs_datatype* spa,
		   char*  spamap,
		   int*    a_ind,
		   int*    a_nnz,
		   int     lu_colcount)
{
  int i,ip;
  taucs_datatype pv;

  pv = spa[pivotindex];
  
  if (taucs_iszero(pv)) return;

#ifdef DETAILED_TIMING
  flops += 2.0 * ((double) l_nnz);
#endif /* DETAILED_TIMING */

  for (ip=0; ip<l_nnz; ip++) {
    i = l_ind[ip];
    if (spamap[i] == 0) {
      spamap[i] = 1;
      spa[i] = taucs_zero;
      a_ind[ *a_nnz ] = i;
      a_inrowlist[ *a_nnz ] = rowlists_insert(i,panelcol);
      (*a_nnz)++;
    }
    /*spa[i] -= (spa[pivotindex] * l_re[ip]);*/
    spa[i] = taucs_sub(spa[i],taucs_mul(spa[pivotindex] , l_re[ip]));
  }

  /*
  if ((*a_nnz) > lu_colcount) {
    taucs_printf("oocsp_numfact: prediction=%d, size now=%d\n",lu_colcount,*a_nnz);
    taucs_printf("oocsp_numfact: Column expands beyond prediction\n");
  }
  */

}
#else /* simple col col */

static int oocsp_spcol_n1 = 0;
static int oocsp_spcol_n2 = 0;

static void
spcol_spcol_update(int pivotindex,
			  taucs_datatype* l_re,
			  int*    l_ind,
			  int     l_nnz,
			  int     panelcol,
			  int*    a_inrowlist,
			  taucs_datatype* spa,
			  char*  spamap,
			  int*    a_ind,
			  int*    a_nnz,
			  int     lu_colcount)
{
  register int i;
  register int ip;
  register int ip_block;
  register int loop_bound;
  register char flag;
  register taucs_datatype pv;

  pv = spa[pivotindex];
  
  if (taucs_iszero(pv)) return;

#ifdef DETAILED_TIMING
  flops += 2.0 * ((double) l_nnz);
#endif /* DETAILED_TIMING */

  for (ip_block=0; ip_block<l_nnz; ip_block += BLOCK) {

    loop_bound = min(ip_block + BLOCK,l_nnz);

    flag = 1;
    oocsp_spcol_n1++;
    for (ip=ip_block; ip<loop_bound; ip++) {
      i = l_ind[ip];
      flag &= spamap[i];
      /* spa[i] -= (pv * l_re[ip]); */
      spa[i] = taucs_sub(spa[i], taucs_mul(pv , l_re[ip]));
    }

    if (!flag) {
      oocsp_spcol_n2++;

      for (ip=ip_block; ip<loop_bound; ip++) {
	i = l_ind[ip];
	if (spamap[i] == 0) {
	  spamap[i] = 1;
	  a_ind[ *a_nnz ] = i;
	  a_inrowlist[ *a_nnz ] = rowlists_insert(i,panelcol);
	  (*a_nnz)++;
	  /* we essentially zero and update */
	  /*spa[i] = - (pv * l_re[ip]);*/
	  spa[i] = taucs_neg(taucs_mul(pv , l_re[ip]));
	}
      }
    }
  }
  return;
}
#endif

#if 0
static void spcol_panel_update(int pivotindex,
			       taucs_datatype* l_re,
			       int*    l_ind,
			       int     l_nnz,
			       int*    subpanel, int subpanel_size, 
			       taucs_datatype* subpanel_tmp,
			       int**   a_inrowlist,
			       taucs_datatype* spa,
			       char*   spamap,
			       int**   a_ind,
			       int*    a_nnz,
			       int     nrows)
{
  int i,ii,ip,q,j;
  taucs_datatype x;

  assert(0);

#ifdef DETAILED_TIMING
  flops += ((double) subpanel_size) * 2.0 * ((double) l_nnz);
#endif /* DETAILED_TIMING */

  for (j=0; j<subpanel_size; j++) {
    q = subpanel[j];
    subpanel_tmp[q] = spa[q*nrows+pivotindex];
  }

  for (ip=0; ip<l_nnz; ip++) {
    i = l_ind[ip];
    x = l_re[ip];
    for (j=0; j<subpanel_size; j++) {
      q = subpanel[j];
      ii = (q*nrows) + i;
      if (spamap[ii] == 0) {
	spamap[ii] = 1;
	spa[ii] = taucs_zero;
	a_ind[q][ a_nnz[q] ] = i;
	a_inrowlist[q][ a_nnz[q] ] = rowlists_insert(i,q);
	(a_nnz[q])++;
      }
      /*spa[ii] -= (subpanel_tmp[q] * x);*/
      spa[ii] = taucs_sub(spa[ii], taucs_mul(subpanel_tmp[q] , x));
    }
  }
}
#endif

/****************************************************/
/*                                                  */
/* OLD STUFF                                        */
/*                                                  */
/****************************************************/

#if 0
void x_heap_insert( int* heap, int* heapsize,  int* ipivots, int value) 
{
  heap[ *heapsize ] = value;
  (*heapsize)++;

  taucs_printf("oocsp_numfact: heap insert %d\n",value);
}

int x_heap_extractmin( int* heap, int* heapsize, int* ipivots) 
{
  int i,m,mindex;

  if (*heapsize == 0) return -1;

  m = INT_MAX;

  for (i=0; i<(*heapsize); i++) 
    if (ipivots[heap[i]] < ipivots[m]) {m = heap[i]; mindex=i;}

  if (m == INT_MAX) return -1;

  for (i=mindex; i<(*heapsize)-1; i++) 
    heap[ i ] = heap[i+1];

  (*heapsize)--;
 
  taucs_printf("oocsp_numfact: heap extractmin %d (%d)\n",m,ipivots[m]);

  return m;
}
#endif

/****************************************************/
/*                                                  */
/* NUMERICAL FACTORIZATION MAIN ROUTINE             */
/*                                                  */
/****************************************************/

static
int  oocsp_numfact (taucs_ccs_matrix* A, int* colperm,
		    taucs_io_handle* LU,
		    int* panels,
		    int* schedstart,
		    int* schedend,
		    int* fetchnext,
		    int* ejectnext,
		    int* lcolcount,
		    int* ucolcount,
		    int spawidth,
		    int maxsn
		    )
{
  int errorcode = TAUCS_SUCCESS;

  int i,j,k,ip,p,q,qp,ii,ks;/* ip_next,jp_next,jp omer*/

  int nrows, ncols;

  
  /*char    fname[256]; 
  int     file; 
  int     mode; 
  mode_t  perm; omer*/

  /*int     len,status; omer*/

  /*
  double* pr;
  */

  /* NEW VARS (Sivan, for this function) */

  int fn;
  int en;
  int nsteps;
  int step;

  /*int nnz; omer*/
  /*off_t offset; omer*/
  /*ssize_t io_size; omer*/

  int  heapsize;
  int* heap = NULL;

  char* nnzmap = NULL;   /* bit vector */
  char* lindices = NULL; /* bit vector */
  /* char* uindices; */ /* bit vector */

  int* panel_id = NULL;
  int* panel_nnz = NULL;
  int**    panel_ind = NULL;
  int**    panel_inrowlist = NULL;
  taucs_datatype** panel_re = NULL;

  int* Lclen = NULL;
  int* Uclen = NULL;

  /*
  taucs_datatype* update_tmp;
  int* update_vec;
  int  update_vec_next;
  */

#ifdef SPA_ONEARRAY
  taucs_datatype*  panel_spa = NULL; /* warning */
  char*    panel_spamap = NULL; /* warning */
#else
  taucs_datatype**  panel_spa = NULL;
  char**    panel_spamap = NULL;
#endif

  int     unext,lnext;
  int*    lu_ind = NULL;
  taucs_datatype* lu_re = NULL;

  int* pivots = NULL;
  int* ipivots = NULL;

  int    maxind;
  double maxval, absval;
  int    pivotindex;

#ifdef SNODES
  int*    snode_ind = NULL;
  taucs_datatype* snode_re = NULL;
  /*char*   snode_map;*/ /* sivan changed to int to support m2 */
  int*    snode_map = NULL;
  int*    snode_pivrows = NULL;
  int*    snode_index = NULL;

  int     snode_hash=0; /* warning */
  int     snode_size, snode_flag, snode_nnz, snode_lastcol;
  int     snode_last;
  int     hash;
  int     snode_id;

  taucs_datatype* S = NULL;
  taucs_datatype* P = NULL;
  int*    spa_updcols = NULL;
  int     spa_n;
  int*    m2 = NULL;

  int*    srows = NULL;/* indices of rows in the supernode */
  int     srows_n;     /* number of rows in the supernode  */
  int     srow_next;

  int     dense_flag;
  int     tmp;
#endif

  int maxcolcount;
  
  /*double time_tmp; omer*/

  /* READ GLOBALS */

  taucs_printf("oocsp_numfact: Using %.0lf MBytes of memory\n",
	     remaining_memory/1048576.0);


  /* START THE FACTORIZATION */
  /*
  nrows = A->nrows;
  ncols = A->ncols;
  */
  nrows = A->m;
  ncols = A->n;

#ifdef DETAILED_TIMING
  flops       = 0.0;
  scatters    = 0.0;
  gathers     = 0.0;
  num_heap_ops    = 0.0;
  rowlist_ops = 0.0;
  time_append = 0.0;
  time_read   = 0.0;
  time_colcol = 0.0;
  time_colcol_1 = 0.0;
  time_colcol_2 = 0.0;
  time_factor = 0.0;
  time_scatter= 0.0;
  time_gather = 0.0;
#ifdef SNODES
  time_snode_detect  = 0.0;
  time_snode_prepare = 0.0;
  time_snode_dense   = 0.0;
#endif
  bytes_read  = 0.0;
  bytes_appended = 0.0;
  col_read    = 0.0;
  col_ooc_updates = 0.0;
#endif /* DETAILED_TIMING */

  time_total  = taucs_wtime();
  
  maxcolcount = 0;
  for (j=0; j<ncols; j++)
    maxcolcount = max( maxcolcount, lcolcount[j]+ucolcount[j] );

  taucs_printf("oocsp_numfact: nrows = %d, ncols = %d\n",
	     nrows,ncols);
  taucs_printf("oocsp_numfact: maxcolcount = %d, spawidth = %d, maxsn = %d\n",
	     maxcolcount, spawidth, maxsn);

  /*** MEMORY ALLOCATION ***/

  fn = 0;
  en = 0;

  pivots           = (int*)  taucs_malloc(ncols*sizeof(int));
  Lclen            = (int*)  taucs_calloc(ncols,sizeof(int));
  Uclen            = (int*)  taucs_calloc(ncols,sizeof(int));
  ipivots          = (int*)  taucs_malloc(ncols*sizeof(int));
  lindices         = (char*) taucs_malloc(nrows*sizeof(char));
  /* uindices = (char*)taucs_malloc(nrows*sizeof(char)); */
  nnzmap           = (char*) taucs_malloc(nrows*sizeof(char));
  heap             = (int*)  taucs_malloc(nrows*sizeof(int));

#ifdef SNODES
  /* the supernodes consists of contiguous columns and we need to know */
  /* the corresponding pivot rows */
  snode_lastcol = snode_nnz = snode_size = 0;
  snode_id      = 0;
  snode_last    = 0;

  taucs_printf("sizeof datatype = %d\n",sizeof(taucs_datatype));

  snode_pivrows = (int*)   taucs_malloc(maxsn*sizeof(int)); /* size was ncols */
  snode_ind     = (int*)   taucs_malloc(nrows*sizeof(int));
  snode_re      = (taucs_datatype*)taucs_malloc(nrows*sizeof(taucs_datatype));
  snode_map     = (int*)   taucs_malloc(nrows*sizeof(int));

  snode_index   = (int*)taucs_malloc(ncols*sizeof(int));

  S = (taucs_datatype*) taucs_malloc(maxcolcount * maxsn  * sizeof(taucs_datatype) );
  P = (taucs_datatype*) taucs_malloc(maxcolcount * spawidth   * sizeof(taucs_datatype) );
  srows       = (int*) taucs_malloc(maxcolcount * sizeof(int) );
  spa_updcols = (int*) taucs_malloc(spawidth * sizeof(int) );

  /*
  lu_re    = (taucs_datatype*)taucs_malloc(maxsn * nrows * sizeof(taucs_datatype) );
  lu_ind   = (int*)   taucs_malloc(maxsn * nrows * sizeof(int)    );
  */
  lu_re    = (taucs_datatype*)taucs_malloc(maxsn * maxcolcount * sizeof(taucs_datatype) );
  lu_ind   = (int*)   taucs_malloc(maxsn * maxcolcount * sizeof(int)    );
  /*
  taucs_printf("lu_re  = %08x -> %08x\n",lu_re,lu_re+(maxsn*maxcolcount));
  taucs_printf("lu_ind = %08x -> %08x\n",lu_ind,lu_ind+(maxsn*maxcolcount));
  */
#else /* SNODES */
  lu_re    = (taucs_datatype*)taucs_malloc(maxcolcount * sizeof(taucs_datatype));
  lu_ind   = (int*)   taucs_malloc(maxcolcount * sizeof(int));
  /*
  lu_re    = (taucs_datatype*)taucs_malloc(nrows*sizeof(taucs_datatype));
  lu_ind   = (int*)taucs_malloc(nrows*sizeof(int));
  */
#endif /* not SNODES */

  /* These two can be smaller */

  /*
  update_vec = (int*)taucs_malloc(ncols*sizeof(int));
  update_tmp = (taucs_datatype*)taucs_malloc(ncols*sizeof(taucs_datatype));
  */

  panel_id  = (int*)  taucs_malloc(ncols*sizeof(int));
  panel_nnz = (int*)  taucs_malloc(ncols*sizeof(int));
  panel_ind = (int**) taucs_calloc(ncols,sizeof(int*));
  panel_inrowlist = (int**) taucs_calloc(ncols,sizeof(int*));
  panel_re  = (taucs_datatype**)taucs_calloc(ncols,sizeof(taucs_datatype*));

  if (spawidth > 0) {
#ifdef SPA_ONEARRAY
    panel_spa = (taucs_datatype*)taucs_malloc(spawidth*nrows*sizeof(taucs_datatype));
    panel_spamap = (char*)taucs_malloc(spawidth*nrows*sizeof(char) );
#else
    panel_spa = (taucs_datatype**) taucs_calloc(spawidth,sizeof(taucs_datatype*));
    panel_spamap = (char**) taucs_calloc(spawidth,sizeof(char*) );
#endif
  } 

  if (!Lclen
      || !Uclen
      || !ipivots
      || !lindices
      || !nnzmap
      || !heap
      || !panel_id
      || !panel_nnz
      || !panel_ind
      || !panel_re
      || !panel_inrowlist
      || ((spawidth>0) && (!panel_spa || !panel_spamap))
      ) {

    taucs_printf("taucs: ooc LU out of memory in numerical factorization (1)\n");
    errorcode = TAUCS_ERROR_NOMEM;
    goto end;
  }

#ifdef SNODES
  if (!pivots
      || !snode_pivrows
      || !snode_ind
      || !snode_re
      || !snode_map
      || !snode_index
      || !S
      || !P
      || !srows
      || !spa_updcols
      || !lu_re
      || !lu_ind
      ) {
    taucs_printf("taucs: ooc LU out of memory in numerical factorization (2)\n");
    errorcode = TAUCS_ERROR_NOMEM;
    goto end;
  }
#else
  if (!pivots
      || !lu_re
      || !lu_ind
      ) {
    taucs_printf("taucs: ooc LU out of memory in numerical factorization (3)\n");
    errorcode = TAUCS_ERROR_NOMEM;
    goto end;
  }
#endif

  /*** MEMORY ALLOCATION DONE, NOW OTHER INITIALIZATIONS ***/
    
  heapsize = 0;
  for (i=0; i<nrows; i++) nnzmap[i] = 0;

  if ((errorcode=rowlists_init(maxcolcount * iabs(spawidth),nrows))!=TAUCS_SUCCESS) goto end; 
  
  for (i=0; i<ncols; i++) ipivots[i] = INT_MAX;

  for (i=0; i<nrows; i++) {
    lindices[i] = 1;
    /*uindices[i] = 0;*/
  }

  for (i=0; i<ncols; i++) panel_id[i] = -1;

#ifdef SNODES
  for (i=0; i<ncols; i++) pivots[i] = INT_MAX;
  for (i=0; i<ncols; i++) snode_index[i] = -1;
  for (i=0; i<nrows; i++) snode_map[i]   = -1;
#else
#endif

#ifdef SPA_ONEARRAY
#else
#endif

  if ((errorcode = spa_init( nrows )) != TAUCS_SUCCESS) goto end;

  nsteps=0; 
  for (i=0; i<ncols; i++) {
    if ((int) (schedstart[i]) > nsteps) nsteps = (int) schedstart[i];
  }

  /*** DONE INITIALIZING, NOW START THE FACTORZITION ***/

  taucs_printf("oocsp_numfact: Starting numerical factorization (%d steps)\n",nsteps);

  p = 0;

  for (step=1; step<=nsteps; step++) {

    taucs_printf("oocsp_numfact: Starting step %d/%d\r",step,nsteps);

    if (p==0) {
      /*taucs_printf("oocsp_numfact: (new panel)\n");*/
#ifdef SNODES
      snode_last = -1;
#endif

      if (!rowlists_isempty())
	taucs_printf("oocsp_numfact: Internal Error (row lists not empty)\n");
      if (heapsize) 
	taucs_printf("oocsp_numfact: Internal Error (heap not empty; 1)\n");
      for (i=0; i<nrows; i++) 
	if (nnzmap[i]) taucs_printf("oocsp_numfact: Internal Error (heap not empty; 2)\n");
    } else {
      taucs_printf("oocsp_numfact: (same panel)\n");
    }
    
    /* LOAD A PARTIAL PANEL */
    
    while (fn < ncols && 
            (schedstart[fetchnext[fn]]) == step) {
      j = fetchnext[fn];

      /*
      if (A->clen[j] > lcolcount[j]+ucolcount[j]) {
	taucs_printf("oocsp_numfact: Column %d in matrix larger than L+U estimate\n",j);
	taucs_printf("oocsp_numfact: Size %d > %d+%d\n",A->clen[j],lcolcount[j],ucolcount[j]); 
	taucs_printf("oocsp_numfact: Aborting\n");
      }

      panel_nnz[p] = A->clen[j];
      */

      /* New code: Sivan 28 Feb 2002 */

      panel_nnz[p] = (A->colptr)[colperm[j]+1] - (A->colptr)[colperm[j]];
      panel_id[p]  = j;
      if (lcolcount[j] < 1 
	  || lcolcount[j] > nrows
	  || ucolcount[j] < 1 
	  || ucolcount[j] > nrows) {
	taucs_printf("oocsp_numfact: Column %d: l,u colcounts = (%d+%d)\n",
		  j,lcolcount[j],ucolcount[j]);
	taucs_printf("oocsp_numfact: Invalid column nonzero count\n\n");
      }	

      panel_inrowlist[p] = (int*)    taucs_malloc((lcolcount[j] + ucolcount[j]) * sizeof(int));
      panel_ind[p]       = (int*)    taucs_malloc((lcolcount[j] + ucolcount[j]) * sizeof(int));
      panel_re[p]        = (taucs_datatype*) taucs_malloc((lcolcount[j] + ucolcount[j]) * sizeof(taucs_datatype));
      if (!panel_inrowlist[p] || 
	  !panel_ind[p] || 
	  !panel_re[p]) {
	fprintf(stderr,"out of memory for panel compressed vector\n");
	fprintf(stderr,"j = %d lcolcount = %d ucolcount = %d\n",
		j,lcolcount[j],ucolcount[j]);
	errorcode = TAUCS_ERROR_NOMEM;
	goto end;
      }
      /*taucs_printf("allocated panel_ind[%d] = %08x\n",p,panel_ind[p]);*/ /* xxx */

      if (spawidth > 0) {
#ifdef SPA_ONEARRAY
#else
	panel_spa[p]     = (taucs_datatype*) taucs_malloc(nrows * sizeof(taucs_datatype));
	panel_spamap[p]  = (char*)   taucs_malloc(nrows * sizeof(char));
	if (!panel_spa[p] || 
	    !panel_spamap[p]) {
	  fprintf(stderr,"out of memory for panel spa\n");
	  fprintf(stderr,"j = %d lcolcount = %d ucolcount = %d\n",
		  j,lcolcount[j],ucolcount[j]);
	  errorcode = TAUCS_ERROR_NOMEM;
	  goto end;
	}
#endif
      }

      /*
      time_tmp = taucs_wtime();
      oocsp_readcol(A,j,panel_ind[p],panel_re[p]);
      time_read += (taucs_wtime() - time_tmp);
      bytes_read += (double) (panel_nnz[p] * (sizeof(taucs_datatype)+sizeof(int)));
      */

      /* new code: Sivan 28 Feb 2002 */
      {
	int jp,ip,i;
	jp = colperm[j];
	for (i=0, ip=(A->colptr)[jp]; ip<(A->colptr)[jp+1]; i++, ip++) {
	  (panel_ind[p])[i] = (A->rowind)[ip];
	  (panel_re [p])[i] = (A->taucs_values)[ip];
	  assert( i < lcolcount[j] + ucolcount[j] );
	  /*
	  printf(">>> reading column number %d (index %d), row %d value %.2e\n",
		 j,jp,(panel_ind[p])[i],(panel_re[p])[i]);
	  */
	}
      }

      if (spawidth > 0) {
	if (p >= spawidth) {
	  taucs_printf("oocsp_numfact: p=%d spawidth=%lg\n",p,spawidth);
	  taucs_printf("oocsp_numfact: Panel wider than spawidth\n");
	  goto end;
	} 
	scatter(panel_nnz[p],panel_re[p],panel_ind[p],
#ifdef SPA_ONEARRAY
		panel_spa+(p * nrows),
		panel_spamap+(p * nrows)
#else
		panel_spa[p],
		panel_spamap[p]
#endif
		);
      }

      for (i=0; i<panel_nnz[p]; i++) {
	panel_inrowlist[p][i] = rowlists_insert(panel_ind[p][i],
						p);

	/*if (nnzmap[ panel_ind[p][i] ] == 0 && uindices[ panel_ind[p][i] ]) {*/
	if (nnzmap[ panel_ind[p][i] ] == 0 && !lindices[ panel_ind[p][i] ]) {
	  nnzmap[ panel_ind[p][i] ] = 1;
	  num_heap_insert( heap, &heapsize, ipivots, panel_ind[p][i] );
	}
      }

      p++;
      fn++;
    }

    /*taucs_printf("oocsp_numfact: Updating (step %d)\n",step);*/

#ifndef SNODES
    /* we need to prevent panel cols from being loaded! */

    while ((i = num_heap_extractmin( heap, &heapsize, ipivots)) != -1) {
      if (i == INT_MAX) continue;

      nnzmap[ i ] = 0;
      
      k = ipivots[i];
      if (k == INT_MAX) continue;
      
      /*taucs_printf("oocsp_numfact: extracted row %d (col %d) from heap. %d\n",i,k,uindices[i]);*/

      if (panels[k] == panels[ panel_id[0] ]) {
	/*taucs_printf("oocsp_numfact: skipping column from this panel (col %d, panel %lg)\n",
	       k,panels[k]);*/
	continue;
      }
      
      /*if (lindices[i]) continue;*/   /*if (!uindices[i]) continue;*/

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      /* Sivan: replaced 2 March 2002 */
      /*oocsp_readcol(L,k,lu_ind,lu_re);*/
      Lreadcol(LU,k,Lclen[k],lu_ind,lu_re);

#ifdef DETAILED_TIMING
      bytes_read += (double) (Lclen[k] * (sizeof(taucs_datatype)+sizeof(int)));
      col_read += 1.0;
#endif /* DETAILED_TIMING */

      /*
      for (ks = k+1; ks<ncols && snode_index[ks]==snode_index[k]; ks++) {
	oocsp_readcol(L,ks,lu_ind+((ks-k)*nrows),lu_re+((ks-k)*nrows));
	bytes_read += (double) ((L.clen[ks]) * (sizeof(taucs_datatype)+sizeof(int)));
#ifdef DETAILED_TIMING
	col_read += 1.0;
#endif
      }
      taucs_printf("oocsp_numfact: Read supernode, %d cols\n",ks-k+1);
      */

#ifdef DETAILED_TIMING
      time_read += (taucs_wtime() - time_tmp);
#endif


      /*
      taucs_printf("oocsp_numfact: col %d (panel %lg, now %lg; read from disk) for row %d, length %d\n",
	     k,panels[k],panels[panel_id[0]],i,(L.clen)[k]);
	     */

      if (Lclen[k] < 0) {
	taucs_printf("oocsp_numfact: k=%d len=%d\n",k,Lclen[k]);
	taucs_printf("oocsp_numfact: Internal Error\n");
      }

      for (ii=0; ii<Lclen[k]; ii++) {
	/*if (nnzmap[ lu_ind[ii] ] == 0 && uindices[ lu_ind[ii] ]*/
	if (nnzmap[ lu_ind[ii] ] == 0 && !lindices[ lu_ind[ii] ]
	    && !taucs_iszero(lu_re[ii])) {
	  nnzmap[ lu_ind[ii] ] = 1;
	  num_heap_insert( heap, &heapsize, ipivots, lu_ind[ii] );
	  /*taucs_printf("oocsp_numfact: inserting row %d into heap\n",lu_ind[ii]);*/
	}
      }

      for(qp = rowlists_head[i]; qp != -1; qp = rowlists_next[qp]) {
	q = rowlists_colind[qp];
#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif
	if (spawidth <= 0) {
	  scatter(panel_nnz[q],panel_re[q],panel_ind[q],
		  spa,spamap);
	  spcol_spcol_update(i,
			     lu_re,lu_ind,Lclen[k],
			     q,panel_inrowlist[q],
			     spa,
			     spamap,
			     panel_ind[q],&(panel_nnz[q]),
			     lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	  gather(panel_nnz[q],panel_re[q],panel_ind[q],
		 spa,spamap);
	} else {
	  spcol_spcol_update(i,
			     lu_re,lu_ind,Lclen[k],
			     q,panel_inrowlist[q],
#ifdef SPA_ONEARRAY
			     panel_spa+(q * nrows),
			     panel_spamap+(q * nrows),
#else
			     panel_spa[q],
			     panel_spamap[q],
#endif
			     panel_ind[q],&(panel_nnz[q]),
			     lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	}
#ifdef DETAILED_TIMING
	time_colcol += (taucs_wtime() - time_tmp);
	col_ooc_updates += 1.0;
#endif
      }

      /*
      update_vec_next = 0;
      for(qp = rowlists_head[i]; qp != -1; qp = rowlists_next[qp]) {
	q = rowlists_colind[qp];
	col_ooc_updates += 1.0;
	update_vec[update_vec_next] = q;
	update_vec_next++;
      }

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif

#ifdef SPA_ONEARRAY
      spcol_panel_update(i,
			 lu_re,lu_ind,Lclen[k],
			 update_vec,update_vec_next,update_tmp,
			 panel_inrowlist,
			 panel_spa,
			 panel_spamap,
			 panel_ind,panel_nnz,
			 nrows);
#endif

#ifdef DETAILED_TIMING
      time_colcol += (taucs_wtime() - time_tmp);
#endif
      */
    }
#else  /* with SNODES */
    /* we need to prevent panel cols from being loaded! */

    while ((i = num_heap_extractmin( heap, &heapsize, ipivots)) != -1) {
      if (i == INT_MAX) continue;

      nnzmap[ i ] = 0;
      
      k = ipivots[i];
      if (k == INT_MAX) continue;
      
      /*printf("last supernode %d this column is %d its supernode %d\n",snode_last,k,snode_index[k]);*/
      assert(k >= 0 && k < ncols); /* sivan oct 2003, chasing the win32 bug */

      if (snode_index[k] == snode_last) continue; /* skip rest of supernode */
      snode_last = snode_index[k]; /* mark for next time */
      
      /*taucs_printf("oocsp_numfact: extracted row %d (col %d) from heap. %d\n",i,k,uindices[i]);*/

      if (panels[k] == panels[ panel_id[0] ]) {
	/*taucs_printf("oocsp_numfact: skipping column from this panel (col %d, panel %lg)\n",
	  k,panels[k]);*/
	continue;
      }
      
      /*if (lindices[i]) continue;*/ /*if (!uindices[i]) continue;*/

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif

      for (ks = k; ks<ncols && snode_index[ks]==snode_index[k]; ks++) {
	/*printf(">>> %d %d\n",ks,snode_index[ks]);*/
	/*oocsp_readcol(L,ks,lu_ind+((ks-k)*maxcolcount),lu_re+((ks-k)*maxcolcount));*/
	Lreadcol(LU,ks,Lclen[ks],lu_ind+((ks-k)*maxcolcount),lu_re+((ks-k)*maxcolcount));
#ifdef DETAILED_TIMING
	bytes_read += (double) (Lclen[ks] * (sizeof(taucs_datatype)+sizeof(int)));
	col_read += 1.0;
#endif /* DETAILED_TIMING */
      }

#ifdef DETAILED_TIMING
      time_read += (taucs_wtime() - time_tmp);
#endif
      /*taucs_printf("oocsp_numfact: Read supernode, %d cols %d:%d\n",ks-k,k,ks-1);*/
      
      if (Lclen[k] < 0) {
	taucs_printf("oocsp_numfact: k=%d len=%d\n",k,Lclen[k]);
	taucs_printf("oocsp_numfact: Internal Error\n");
      }

      for (ii=0; ii<Lclen[k]; ii++) {
	/*if (nnzmap[ lu_ind[ii] ] == 0 && uindices[ lu_ind[ii] ]*/
	if (nnzmap[ lu_ind[ii] ] == 0 && !lindices[ lu_ind[ii] ]
	    && !taucs_iszero(lu_re[ii])) {
	  nnzmap[ lu_ind[ii] ] = 1;
	  num_heap_insert( heap, &heapsize, ipivots, lu_ind[ii] );
	  /*taucs_printf("oocsp_numfact: inserting row %d into heap\n",lu_ind[ii]);*/
	}
      }

      /* determine which panel cols need to be updates */
      dense_flag = 1;
      for (tmp = 0; tmp<1; tmp++) {
	int jj,ii,kk,iip,jjp;

	/*	printf("snode %d:%d\n",k,ks-1);*/

	if (ks-k < SNODE_THRESHOLD) {dense_flag=0; break;}

	assert(spawidth > 0); /* the code for the other case is not implemented yet; Sivan */

#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif

	srows_n = Lclen[k] + 1; /* the diagonal element in column k is not */
                                /* represented explicitely in L, it's 1    */

	if (srows_n < SNODE_THRESHOLD) {dense_flag=0; break;}

	m2 = snode_map; /* reuse the vector */

	/* create an array of row indices, with pivot rows first, in order */
	/* so that the supernode array will be a trapezoidal matrix        */
	/* we should keep this and restore the -1 invariant                */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif

	for (jj=k; jj<ks; jj++) {
	  ii = pivots[jj];
	  srows[jj-k] = ii;
	  m2[ii] = 1;
	}
	/* now the rest of the rows */
	srow_next = ks-k;
	for (iip=0; iip<Lclen[k]; iip++) {
	  ii = lu_ind[iip];
	  if (ii == pivots[k]) srows_n--; /* I have no idea why this row shows up; always with 0.0 */
	  if (m2[ii] == -1) {
	    srows[srow_next] = ii;
	    srow_next++;
	  } else {
	    m2[ii] = -1; /* restore the invariant */
	  }
	}
	assert(srow_next == srows_n);
	
	/* we begin by figuring out which columns of the panel are updated    */
	/* by this supernode.                                                 */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif

	spa_n = 0;
	for (jj=k; jj<ks; jj++) {
	  ii = pivots[jj];
	  for(qp = rowlists_head[ii]; qp != -1; qp = rowlists_next[qp]) {
	    int skip; /* don't add a column twice to spa_updcols */
	    q = rowlists_colind[qp];
	    for (skip=0, jjp=0; jjp<spa_n; jjp++) {
	      if (spa_updcols[jjp] == q) {
		skip = 1;
		break;
	      }
	    }
	    if (!skip) {
	      /*if (jj-k > 4) printf("*** jj-k %d ks-k %d\n",jj-k,ks-k);*/

#ifdef DETAILED_TIMING
	      flops += 2.0 * ( (ks-jj) * (srows_n - (jj-k)) - 0.5*(ks-jj)*(ks-jj) );
	      flops_extra += 2.0 * ( (jj-k) * (srows_n) - 0.5*(jj-k)*(jj-k) );
#endif /* DETAILED_TIMING */

	      spa_updcols[spa_n] = q;
	      /*spa_updptrs[spa_n] = jj-k;*/
	      spa_n++;
	    }
	  }
	}
#ifdef DETAILED_TIMING
	time_snode_1 += (taucs_wtime()-time_snode_tmp);
#endif

	if (spa_n < SNODE_THRESHOLD) {
	  for (jj=k; jj<ks; jj++) {
	    ii = pivots[jj];
	    m2[ii] = -1;
	  }
	  for (iip=0; iip<Lclen[k]; iip++) {
	    ii = lu_ind[iip];
	    m2[ii] = -1; /* restore the invariant */
	  }
	  dense_flag=0; 
	  break;
	}

	/*printf("snode %d:%d (dense)\n",k,ks-1);*/

	/* copy snode columns into a dense array */

	for (jj=k; jj<ks; jj++) {
	  for (iip=0; iip<Lclen[jj]; iip++)
	    /* mark in m2 where each row is */
	    m2[ lu_ind[(jj-k)*maxcolcount + iip] ] = iip;
	  
	  for (iip=0; iip<srows_n; iip++) {
	    if (m2[ srows[iip] ] != -1) {
	      S[ (jj-k)*srows_n + iip ] = lu_re[(jj-k)*maxcolcount + m2[ srows[iip] ]];
	      m2[ srows[iip] ] = -1; /* restore invariant for next column */
	    } else
	      S[ (jj-k)*srows_n + iip ] = taucs_zero;
	  }
	}
#ifdef DETAILED_TIMING
	time_snode_3 += (taucs_wtime()-time_snode_tmp);
#endif

	/* now the snode is stored in a dense array S, with row indices srows */
	/* and with the diagonal block of L on top.                           */

	/* next we copy the columns of the panels that need to be updated     */
	/* into the dense array P.                                            */

	/* we then copy these columns into the dense array P, and if          */
	/* fill occurs, we update the nonzero bitmap and row lists.           */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif
#define OLD_1_no
#ifdef OLD_1
	for (jjp=0; jjp<spa_n; jjp++) {
	  jj = spa_updcols[jjp];

	  for (iip=0; iip<srows_n; iip++) {
	  /*for (iip=spa_updptrs[jjp]; iip<srows_n; iip++) {*/
	    if (panel_spamap[jj*nrows + srows[iip]] == 0) {
	      /* fill will occur here */
	      panel_spa[jj*nrows + srows[iip]] = taucs_zero;
	      /*P[jjp*srows_n + iip] = taucs_zero;*/
	      
	      panel_spamap   [jj*nrows + srows[iip]] = 1;
	      panel_ind      [jj][panel_nnz[jj] ] = srows[iip];
	      panel_inrowlist[jj][panel_nnz[jj] ] = rowlists_insert(srows[iip],jj);
	      (panel_nnz[jj])++;
	    }
#ifdef JUNK
	    } else { 
	      /* no fill, just copy the element */
  	      P[jjp*srows_n + iip] = panel_spa[jj*nrows + srows[iip]];
	    }
#endif
	  }
	} 

	{ 
#ifdef DETAILED_TIMING
	  double x = taucs_wtime();
#endif
	  for (jjp=0; jjp<spa_n; jjp++) {
	    jj = spa_updcols[jjp];
	    for (iip=0; iip<srows_n; iip++) {
	      P[jjp*srows_n + iip] = panel_spa[jj*nrows + srows[iip]];
	    }
	  }
#ifdef DETAILED_TIMING
	  time_snode_21 += (taucs_wtime()-x);
#endif
        }
#else /* OLD_1 */
	for (jjp=0; jjp<spa_n; jjp++) {
	  int iip_block, loop_bound, flag;
	  
	  jj = spa_updcols[jjp];

	  for (iip_block=0; iip_block<srows_n; iip_block += SNODE_BLOCK) {

	    loop_bound = min(iip_block + SNODE_BLOCK,srows_n);
	    
	    flag = 1;
	    for (iip=iip_block; iip<loop_bound; iip++)
	      flag &= panel_spamap[jj*nrows + srows[iip]];
	     
	    if (!flag) {
	      for (iip=iip_block; iip<loop_bound; iip++) {
		if (panel_spamap[jj*nrows + srows[iip]] == 0) {
		  panel_spamap   [jj*nrows + srows[iip]] = 1;
		  panel_spa      [jj*nrows + srows[iip]] = taucs_zero;
		  panel_ind      [jj][panel_nnz[jj] ] = srows[iip];
		  panel_inrowlist[jj][panel_nnz[jj] ] = rowlists_insert(srows[iip],jj);
		  (panel_nnz[jj])++;
		}
	      }
	    }
	  }
	  
	  { 
#ifdef DETAILED_TIMING
	    double x = taucs_wtime();
#endif
	    for (iip=0; iip<srows_n; iip++)
	      P[jjp*srows_n + iip] = panel_spa[jj*nrows + srows[iip]];
#ifdef DETAILED_TIMING
	    time_snode_21 += (taucs_wtime()-x);
#endif
	  }
	}
#endif /* OLD_1 */

#ifdef DETAILED_TIMING
	time_snode_2 += (taucs_wtime()-time_snode_tmp);
#endif

	/*printf("supernode update: col %d pivotrow %d updates col %d\n",jj,ii,panel_id[q]);*/
 
#ifdef DETAILED_TIMING
	time_snode_prepare += (taucs_wtime() - time_tmp);
#endif

#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif

	/*flops += (2.0 * spa_n * srows_n * (ks-k) - 2.0); */ /* over estimate; sivan. */
	/* we can subract triangle in estimate, skip zero pivots in flops & code */
#ifdef DETAILED_TIMING
	flops_dense += (2.0 * spa_n * srows_n * (ks-k) -
			1.0 * spa_n * (ks-k) * (ks-k)); 
#endif /* DETAILED_TIMING */

#ifdef JUNK
	printf("TRSM's RHS:\n");
	for (kk=k; kk<ks; kk++) {
	  for (jj=0; jj<spa_n; jj++) {
	    printf("%c", P[jj*srows_n + (kk-k)] ? '*' : 'o');
	  }
	  printf("\n");
	}
#endif

#ifdef USE_BLAS
	{ 
	  int m = ks-k;
	  int n = spa_n;
	  int M = srows_n - m;
	  int N = n;
	  int K = m;
	  
	  if (m > BLAS_THRESHOLD && n > BLAS_THRESHOLD) {
	    taucs_trsm("Left",
		       "Lower",
		       "No transpose",
		       "Unit",
		       &m,&n,
		       &taucs_one_const,
		       S,&srows_n,
		       P,&srows_n
		       );
	  } else {
	    /* TRSM */
	    for (jj=0; jj<spa_n; jj++) {
	      for (kk=k; kk<ks; kk++) {
		for (ii=kk-k+1; ii<(ks-k); ii++) {
		  /*P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);*/
		  P[jj*srows_n + ii] = 
		  taucs_sub(P[jj*srows_n + ii],
			    taucs_mul(P[jj*srows_n + (kk-k)] , S[(kk-k)*srows_n + ii]));
		}
	      }
	    }
	  }

	  if (M > BLAS_THRESHOLD && N > BLAS_THRESHOLD && K > BLAS_THRESHOLD) {
	    taucs_gemm("No transpose",
		       "No transpose",
		       &M,&N,&K,
		       &taucs_minusone_const,
		       S+m, &srows_n,
		       P  , &srows_n,
		       &taucs_one_const,
		       P+m, &srows_n);
	  } else {
	    /* GEMM */
	    for (jj=0; jj<spa_n; jj++) {
	      for (kk=k; kk<ks; kk++) {
		for (ii=(ks-k); ii<srows_n; ii++) {
		  /*P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);*/
		  P[jj*srows_n + ii] =
		    taucs_sub(P[jj*srows_n + ii],
			      taucs_mul(P[jj*srows_n + (kk-k)] , S[(kk-k)*srows_n + ii]));
		}
	      }
	    }
	  }
	}
#else	   
	/* TRSM */
	for (jj=0; jj<spa_n; jj++) {
	  for (kk=k; kk<ks; kk++) {
	    for (ii=kk-k+1; ii<(ks-k); ii++) {
	      P[jj*srows_n + ii] =
		taucs_sub(P[jj*srows_n + ii],
			  taucs_mul(P[jj*srows_n + (kk-k)] , S[(kk-k)*srows_n + ii]));
	      /*P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);*/
	    }
	  }
	}

	/* GEMM */
	for (jj=0; jj<spa_n; jj++) {
	  for (kk=k; kk<ks; kk++) {
	    for (ii=(ks-k); ii<srows_n; ii++) {
	      P[jj*srows_n + ii] =
		taucs_sub(P[jj*srows_n + ii],
			  taucs_mul(P[jj*srows_n + (kk-k)] , S[(kk-k)*srows_n + ii]));

	      /*P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);*/
	    }
	  }
	}
#endif
#ifdef DETAILED_TIMING
	time_snode_dense += (taucs_wtime() - time_tmp);
	col_ooc_updates += 1.0;
#endif

#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif

	/* now copy panel columns out of the dense P */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif
	for (jjp=0; jjp<spa_n; jjp++) {
	  jj = spa_updcols[jjp];
	  for (iip=0; iip<srows_n; iip++) {
	    /*for (iip=spa_updptrs[jjp]; iip<srows_n; iip++) {*/
	    panel_spa[jj*nrows + srows[iip] ] = P[jjp*srows_n + iip];
	  }
	}
#ifdef DETAILED_TIMING
	time_snode_4 += (taucs_wtime()-time_snode_tmp);
        time_snode_prepare += (taucs_wtime() - time_tmp);
#endif
      }
      if (!dense_flag) { /* we didn't do it using the blas since m,n, or k were too small */
	/* not worth copying into dense arrays etc */
	/* determine which panel cols need to be updates */
	int jj,ii;
	for (jj=k; jj<ks; jj++) {
	  ii = pivots[jj];
	  if (ii == INT_MAX) {
	    taucs_printf("oocsp_numfact: internal error (supernode update)\n");
	    exit(1);
	  }
	  for(qp = rowlists_head[ii]; qp != -1; qp = rowlists_next[qp]) {
	    q = rowlists_colind[qp];
	    /*printf("supernode update: col %d pivotrow %d updates col %d\n",jj,ii,panel_id[q]);*/

#ifdef DETAILED_TIMING
	    time_tmp = taucs_wtime();
#endif
	    if (spawidth <= 0) {
	      taucs_printf("oocsp_numfact: internal error (supernode without a spa)\n");
	      exit(1);
	    } else {
	      spcol_spcol_update(ii,
				 lu_re+((jj-k)*maxcolcount),lu_ind+((jj-k)*maxcolcount),Lclen[jj],
				 q,panel_inrowlist[q],
#ifdef SPA_ONEARRAY
				 panel_spa+(q * nrows),
				 panel_spamap+(q * nrows),
#else
				 panel_spa[q],
				 panel_spamap[q],
#endif
				 panel_ind[q],&(panel_nnz[q]),
				 lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	    }
#ifdef DETAILED_TIMING
	    time_colcol   += (taucs_wtime() - time_tmp);
	    time_colcol_1 += (taucs_wtime() - time_tmp);
	    col_ooc_updates += 1.0;
#endif
	  }
	}
      }
    }

#endif /* else SNODES */

    
    /*taucs_printf("oocsp_numfact: Factoring and Writing (step %d)\n",step);*/

    while (en < ncols && 
            (schedend[ejectnext[en]]) == step) {
      p--;
      j = ejectnext[en];

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      
      if (panel_id[p] < 0)  taucs_printf("oocsp_numfact: internal error (panel stack)\n");
      if (panel_id[p] != j) {
	taucs_printf("oocsp_numfact: en=%d p=%d, j=%d, panel_id[p]=%d\n",
	       en,p,j,panel_id[p]);
	taucs_printf("oocsp_numfact: internal error (panel id's)\n");
      }

      /* gather */

      if (spawidth > 0) {
	gather(panel_nnz[p],panel_re[p],panel_ind[p],
#ifdef SPA_ONEARRAY
	       panel_spa + (p * nrows),
	       panel_spamap + (p * nrows));
#else
	       panel_spa[p],
	       panel_spamap[p]);
#endif
      }

      /* Find pivot */
      
      maxval = 0;
      maxind = -1;
      for (ip = 0; ip<panel_nnz[p]; ip++) {
	absval = (double) taucs_abs(panel_re[p][ip]);
	if (!lindices[panel_ind[p][ip]]) continue;
	if (absval > maxval) {
	  maxval = absval;
	  maxind = ip;
	}
      }
      
      if ( maxind == -1 ) {
	taucs_printf("oocsp_numfact: Zero Column (1)\n");
	errorcode = TAUCS_ERROR_SINGULAR;
	goto end;
      }
      
      if ( taucs_iszero(panel_re[p][maxind]) ) {
	taucs_printf("oocsp_numfact: Zero Pivot (2)\n");
	errorcode = TAUCS_ERROR_SINGULAR;
	goto end;
      }
      
      pivotindex = panel_ind[p][maxind];

      /*
      taucs_printf("oocsp_numfact: pivot for column %d is %d (%lg)\n",j,pivotindex,
	     panel_re[p][maxind]);
	     */

      if ( ipivots[ pivotindex ] != INT_MAX ) {
	taucs_printf("oocsp_numfact: Pivoting twice on the same row\n");
	errorcode = TAUCS_ERROR; /* internal error, this should not happen */
	goto end;
      }
      
      ipivots[ pivotindex ] = j;
#ifdef SNODES
      pivots[ j ] = pivotindex;
#endif
      /*printf("### pivot row for column %d is %d\n",j,pivotindex);*/

      lindices[ pivotindex ] = 0;
      /*uindices[ pivotindex ] = 1;*/
      
      /* copy to L, U */
      
      /*taucs_printf("oocsp_numfact: copying to l,u\n");*/
      
      lnext = 0;
      /* unext = nrows-1; */
      unext = maxcolcount-1;
      for (ip = 0; ip<panel_nnz[p]; ip++) {
	i = panel_ind[p][ip];
	/*if (uindices[i]) {*/
	if (!lindices[i]) {
	  lu_re[unext]  = panel_re[p][ip];
	  lu_ind[unext] = i;
	  /*assert( unext > 0 && unext < maxsn*maxcolcount);*/
	  unext--;
	} else {
	  /*lu_re[lnext]  = (panel_re[p][ip])/(panel_re[p][maxind]);*/
	  lu_re[lnext]  = taucs_div(panel_re[p][ip] , panel_re[p][maxind]);
	  lu_ind[lnext] = i;
	  lnext++;
	}	  
      }

#ifdef DETAILED_TIMING
      flops += (double) lnext;
#endif /* DETAILED_TIMING */
      
#ifdef DETAILED_TIMING
      time_factor += (taucs_wtime() - time_tmp);
#endif

      /* Write out column of L, U */
      /*
      taucs_printf("oocsp_numfact: writing column %d of l,u (sizes %d %d)\n",
	     j,lnext,nrows - 1 - unext);
	     */

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      /*
      oocsp_appendcol(U,j,nrows - 1 - unext,
		      lu_ind + (unext+1),
		      lu_re  + (unext+1));
      */

      /* Sivan: replaced this code on 2 March 2002 */
      /*
      oocsp_appendcol(U,j,maxcolcount - 1 - unext,
		      lu_ind + (unext+1),
		      lu_re  + (unext+1));
      */
      assert(Uclen[j] == 0);
      Uclen[j] = maxcolcount - 1 - unext;
      if ((errorcode=Uappendcol(LU,j,maxcolcount - 1 - unext,lu_ind + (unext+1),lu_re  + (unext+1)))!=TAUCS_SUCCESS) goto end;
#ifdef DETAILED_TIMING
      time_append += (taucs_wtime() - time_tmp);
#endif

#ifdef DETAILED_TIMING
      bytes_appended += (double) ((maxcolcount - 1 - unext) 
				  * (sizeof(taucs_datatype)+sizeof(int)));
#endif /* DETAILED_TIMING */

#ifdef SNODES
      /* detect supernodes */

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      
      snode_flag = 1; /* we assume so for now */

      /* a quick check using a hash function to quickly rule out columns */

      if (j>0 && panels[j] != panels[j-1]) snode_flag = 0;

      hash = pivotindex;
      for (ii=0; ii<snode_size; ii++)
	hash ^= snode_pivrows[ii];
      for (ii=0; ii<lnext; ii++)
	hash ^= lu_ind[ii];

      if (hash == snode_hash) {
	/* The hash is identical, but is it in the same supernode? */

	/* first, mark this column's structure in the snode_map    */
	/* then make sure all the supernode's nonzeros are marked  */

	/* for (ii=0; ii<nrows; ii++) assert( snode_map[ii] == -1 ); */

	assert(pivotindex < nrows);
	snode_map[pivotindex       ] = 1;
	for (ii=0; ii<lnext; ii++)      {snode_map[lu_ind[ii]       ] = 1; assert(lu_ind[ii] < nrows);}
	for (ii=0; ii<snode_size; ii++) {snode_map[snode_pivrows[ii]] = 1; assert(snode_pivrows[ii] < nrows);}
	for (ii=0; ii<snode_nnz; ii++) {
	  assert(ii < nrows);
	  if (snode_map[snode_ind[ii]] != 1) snode_flag = 0;
	}

	/* next, zero this column's structure in the snode_map     */
	/* then make sure all the supernode's nonzeros are marked  */


    	                                snode_map[pivotindex]        = -1;
	for (ii=0; ii<lnext; ii++)	snode_map[lu_ind[ii]       ] = -1;
	for (ii=0; ii<snode_size; ii++) snode_map[snode_pivrows[ii]] = -1;
	
	/* mark this supernodes nonzeros in the map                */
	
	for (ii=0; ii<snode_nnz; ii++) {snode_map[snode_ind[ii]] = 1; assert(snode_ind[ii] < nrows);}

	/* check that all the column's nonzeros are in the snode   */

	for (ii=0; ii<lnext; ii++) if (snode_map[lu_ind[ii]] != 1) snode_flag = 0;

	/* zero the bitmap for next time                           */

	for (ii=0; ii<snode_nnz; ii++) snode_map[snode_ind[ii]] = -1;

	/* for (ii=0; ii<nrows; ii++) assert( snode_map[ii] == -1 ); */

      } else
	snode_flag = 0;

      if (snode_size >= maxsn) snode_flag = 0;

      if (j != snode_lastcol+1) snode_flag = 0;

      if (snode_flag) {

	/* THIS COLUMN BELONGS TO AN EXISTING SUPERNODE */
	
	snode_pivrows[snode_size] = pivotindex;
	snode_size++;

	for (ii=0; ii<snode_size; ii++)
	  spa[snode_pivrows[ii]] = taucs_zero;
	for (ii=0; ii<lnext; ii++)
	  spa[lu_ind[ii]] = lu_re[ii];

	for (ii=0; ii<snode_nnz; ii++) {
	  assert(ii < nrows);
	  snode_re[ii] = spa[snode_ind[ii]];
	}

#ifdef DETAILED_TIMING
	time_snode_detect += (taucs_wtime() - time_tmp);
	time_tmp = taucs_wtime();
#endif

	/* Sivan: replaced 2 March 2002 */
	/* oocsp_appendcol(L,j,snode_nnz,snode_ind,snode_re);*/
	assert(Lclen[j] == 0);
	Lclen[j] = snode_nnz;
	if ((errorcode=Lappendcol(LU,j,snode_nnz,snode_ind,snode_re))!=TAUCS_SUCCESS) goto end;

#ifdef DETAILED_TIMING
	time_append += (taucs_wtime() - time_tmp);
	bytes_appended += (double) (snode_nnz
				    * (sizeof(taucs_datatype)+sizeof(int)));
#endif

	/*taucs_printf("oocsp_numfact: supernode size = %d (column %d)\n",snode_size,j);*/
      } else {
	/* THIS COLUMN BELONGS TO A NEW SUPERNODE */

	snode_id ++;

	/*taucs_printf("oocsp_numfact: new supernode, column %d row %d\n",j,pivotindex);*/

#ifdef DETAILED_TIMING
	time_snode_detect += (taucs_wtime() - time_tmp);
	time_tmp = taucs_wtime();
#endif
      
	/*oocsp_appendcol(L,j,lnext,lu_ind,lu_re);*/
	assert(Lclen[j] == 0);
	Lclen[j] = lnext;
	if ((errorcode=Lappendcol(LU,j,lnext,lu_ind,lu_re))!=TAUCS_SUCCESS) goto end;

#ifdef DETAILED_TIMING
	time_append += (taucs_wtime() - time_tmp);
	bytes_appended += (double) (lnext 
				    * (sizeof(taucs_datatype)+sizeof(int)));
	time_tmp = taucs_wtime();
#endif

	snode_hash = pivotindex;
	for (ii=0; ii<lnext; ii++) {
	  assert(ii < nrows);
	  snode_hash ^= lu_ind[ii];
	  snode_ind[ii] = lu_ind[ii];
	}
	snode_pivrows[0] = pivotindex;
	snode_nnz = lnext;
	snode_size = 1;

#ifdef DETAILED_TIMING
	time_snode_detect += (taucs_wtime() - time_tmp);
#endif
      }

      snode_index[j] = snode_id;

      snode_lastcol = j;

#else /* SNODES */

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      /*oocsp_appendcol(L,j,lnext,lu_ind,lu_re);*/
      assert(Lclen[j] == 0);
      Lclen[j] = lnext;
      if ((errorcode=Lappendcol(LU,j,lnext,lu_ind,lu_re))!=TAUCS_SUCCESS) goto end;
#ifdef DETAILED_TIMING
      time_append += (taucs_wtime() - time_tmp);
      bytes_appended += (double) (lnext 
				  * (sizeof(taucs_datatype)+sizeof(int)));
#endif /* DETAILED_TIMING */

#endif /* SNODES */
     
      /* Update rest of panel */

      /*taucs_printf("oocsp_numfact: updating\n");*/

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      if (spawidth <= 0) {
	for(qp = rowlists_head[pivotindex]; qp != -1; qp = rowlists_next[qp]) {
	  q = rowlists_colind[qp];
	  scatter(panel_nnz[q],panel_re[q],panel_ind[q],
		  spa,spamap);
	  spcol_spcol_update(pivotindex,
			     lu_re,lu_ind,lnext,
			     q,panel_inrowlist[q],
			     spa,
			     spamap,
			     panel_ind[q],&(panel_nnz[q]),
			     lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	  gather(panel_nnz[q],panel_re[q],panel_ind[q],
		 spa,spamap);
	}
      } else {
	int spa_n = 0;
	for(qp = rowlists_head[pivotindex]; qp != -1; qp = rowlists_next[qp]) {
	  q = rowlists_colind[qp];
	  /* sivan 20 oct 2003: I am not sure the ifdef is right */
#ifdef SNODES
	  spa_updcols[spa_n] = q;
#endif
	  spa_n++;
	}
	/*
	for(qp = 0; qp < spa_n; qp++) {
	  q = spa_updcols[qp];
	  spcol_spcol_update(pivotindex,
			     lu_re,lu_ind,lnext,
			     q,
			     panel_inrowlist[q],
			     panel_spa    + (q*nrows),
			     panel_spamap + (q*nrows),
			     panel_ind[q],&(panel_nnz[q]),
			     0);
	}
	*/

	spcol_spa_update(pivotindex,
			 lu_re,lu_ind,lnext,
			 spa_updcols,spa_n,nrows,
			 panel_inrowlist,
			 panel_spa,
			 panel_spamap,
			 panel_ind,panel_nnz);
      }
#ifdef DETAILED_TIMING
      time_colcol   += (taucs_wtime() - time_tmp);
      time_colcol_2 += (taucs_wtime() - time_tmp);
#endif

      /*taucs_printf("oocsp_numfact: done updating\n");*/

      /* Eject from panel */

      for (ip = 0; ip<panel_nnz[p]; ip++) {
	i = panel_ind[p][ip];
	rowlists_delete(i,panel_inrowlist[p][ip]);
      }

      /*taucs_printf("freeing panel_ind[%d] = %08x\n",p,panel_ind[p]);*/

      taucs_free(panel_inrowlist[p]); panel_inrowlist[p] = NULL;
      taucs_free(panel_ind[p]);       panel_ind[p] = NULL;
      taucs_free(panel_re[p]);        panel_re[p]  = NULL;
      if (spawidth > 0) {
#ifndef SPA_ONEARRAY
        taucs_free(panel_spa[p]);     anel_spa[p] = NULL;
        taucs_free(panel_spamap[p]);  anel_spamap[p]  = NULL;
#endif
      }
      panel_id[p] = -1;

      /*taucs_printf("oocsp_numfact: done with col\n");*/

      en++;
    }    
  }

#ifdef SNODESHIST
#define HIST_SIZE 32
#define HIST_INC  4
  {
    int i,j,k,s,last;
    int hist[HIST_SIZE];
    for (i=0; i<HIST_SIZE; i++) hist[i] = 0;

    last = 0;
    s = 0;
    for (j=0; j<ncols; j++) {
      if (snode_index[j] == last) {
	s++;
      } else {
	i = s / HIST_INC;
	if (i > HIST_SIZE-1) i=HIST_SIZE-1;
	(hist[i])++;
	/*printf("last snode index %d size %d hist %d\n",last,s,i);*/
	last = snode_index[j];
	s = 1;
      }
    }
    /*printf("last snode index %d size %d\n",last,s);*/
    i = s / HIST_INC;
    if (i > HIST_SIZE-1) i=HIST_SIZE-1;
    (hist[i])++;
    taucs_printf("oocsp_numfact: snode histogram:\n");
    for (i=0; i<HIST_SIZE-1; i++)
      taucs_printf("oocsp_numfact:   %02d-%02d: %d snodes\n",i*HIST_INC+1,(i+1)*HIST_INC,hist[i]);
    taucs_printf("oocsp_numfact:   %02d&up: %d snodes:\n",
	       (HIST_SIZE-1)*HIST_INC+1,hist[HIST_SIZE-1]);
  }
#endif

  if (((errorcode=taucs_io_append(LU, HEADER_NROWS  , 1,      1, TAUCS_INT, &(A->m)    ))) != TAUCS_SUCCESS) goto end;
  if (((errorcode=taucs_io_append(LU, HEADER_NCOLS  , 1,      1, TAUCS_INT, &(A->n)    ))) != TAUCS_SUCCESS) goto end;
  if (((errorcode=taucs_io_append(LU, HEADER_FLAGS  , 1,      1, TAUCS_INT, &(A->flags)))) != TAUCS_SUCCESS) goto end;
  if (((errorcode=taucs_io_append(LU, HEADER_COLPERM, (A->n), 1, TAUCS_INT, colperm    ))) != TAUCS_SUCCESS) goto end;
  if (((errorcode=taucs_io_append(LU, HEADER_IPIVOTS, (A->n), 1, TAUCS_INT, ipivots    ))) != TAUCS_SUCCESS) goto end;
  if (((errorcode=taucs_io_append(LU, HEADER_LCLEN  , (A->n), 1, TAUCS_INT, Lclen      ))) != TAUCS_SUCCESS) goto end;
  if (((errorcode=taucs_io_append(LU, HEADER_UCLEN  , (A->n), 1, TAUCS_INT, Uclen      ))) != TAUCS_SUCCESS) goto end;

  (remaining_memory) += (double) ( 2 * (ncols+1) * sizeof(int));

  {
    double nnzL = 0;
    double nnzU = 0;
    for (j=0; j < (A->n); j++) nnzL += (double) (Lclen[j]);
    for (j=0; j < (A->n); j++) nnzU += (double) (Uclen[j]);
    taucs_printf("oocsp_numfact: nnz(L) = %.2le nnz(U) = %.2le nnz(L+U) = %.2leM\n",
	       nnzL,nnzU,(nnzL+nnzU)/1e6);
  }
  
end:

  if (spawidth > 0) {
#ifdef SPA_ONEARRAY

#else
    for (p=0; p<spawidth; p++) {
      taucs_free(panel_spa[p]);
      taucs_free(panel_spamap[p]);
    }
#endif
    taucs_free(panel_spa);
    taucs_free(panel_spamap);
  }

  spa_finalize();

  rowlists_finalize();

  taucs_free(ipivots);

  taucs_free(Uclen);
  taucs_free(Lclen);

  for (p=0; p<ncols; p++) {
    taucs_free(panel_re[p]);
    taucs_free(panel_ind[p]);
    taucs_free(panel_inrowlist[p]);
  }

  taucs_free(panel_re);
  taucs_free(panel_ind);
  taucs_free(panel_inrowlist);
  taucs_free(panel_nnz);
  taucs_free(panel_id);

/*
  taucs_free(update_vec);
  taucs_free(update_tmp);
*/

#ifdef SNODES
  taucs_free(snode_index);
  taucs_free(snode_pivrows);
  taucs_free(snode_ind);
  taucs_free(snode_re);
  taucs_free(snode_map);

  taucs_free(pivots);

  taucs_free(spa_updcols);
  taucs_free(P);
  taucs_free(S);
  taucs_free(srows);
#endif

  taucs_free(heap);
  taucs_free(nnzmap);
  taucs_free(lindices);
  /*taucs_free(uindices);*/
  taucs_free(lu_re);
  taucs_free(lu_ind);

  /*taucs_free(pivots);
  taucs_free(ipivots);
  */

  /* OLD */

  time_total = taucs_wtime() - time_total;
  taucs_printf("oocsp_numfact: %lg sec total\n",time_total);

#ifdef DETAILED_TIMING
  taucs_printf("oocsp_numfact: %lg extra flops, %2.0lf %\n",flops_extra,100.0*flops_extra/flops);
  taucs_printf("oocsp_numfact: %lg dense flops, %2.0lf %\n",flops_dense,100.0*flops_dense/flops);
  taucs_printf("oocsp_numfact: %lg Mflop dense/s\n",(flops_dense*1e-6)/(time_snode_dense));
 
  taucs_printf("oocsp_numfact: %lg flops\n",flops);
  taucs_printf("oocsp_numfact: %lg scatter ops\n",scatters);
  taucs_printf("oocsp_numfact: %lg gather  ops\n",gathers);
  taucs_printf("oocsp_numfact: %lg heap    ops\n",num_heap_ops);
  taucs_printf("oocsp_numfact: %lg rowlist ops\n",rowlist_ops);
  taucs_printf("oocsp_numfact: %lg sec col/col ops\n",time_colcol);
  taucs_printf("oocsp_numfact: %lg sec col/col ops (%lg+%lg=%lg)\n",time_colcol,
	     time_colcol_1,time_colcol_2,(time_colcol_1+time_colcol_2));
  taucs_printf("oocsp_numfact: %lg sec column factor ops\n",time_factor);
  taucs_printf("oocsp_numfact: %lg sec scatter ops\n",time_scatter);
  taucs_printf("oocsp_numfact: %lg sec gather  ops\n",time_gather);
  taucs_printf("oocsp_numfact: %lg sec io read\n",time_read);
  taucs_printf("oocsp_numfact: %lg sec io write\n",time_append);
#ifdef SNODES
  taucs_printf("oocsp_numfact: %lg sec snode preparations for dense ops\n",time_snode_prepare);
  taucs_printf("oocsp_numfact: %lg sec snode dense ops\n",time_snode_dense);
  taucs_printf("oocsp_numfact: %lg sec snode detection\n",time_snode_detect);
  taucs_printf("oocsp_numfact: %lg sec snode 1\n",time_snode_1);
  taucs_printf("oocsp_numfact: %lg sec snode 2 (%lg)\n",time_snode_2,
		                                      time_snode_21);
  taucs_printf("oocsp_numfact: %lg sec snode 3\n",time_snode_3);
  taucs_printf("oocsp_numfact: %lg sec snode 4\n",time_snode_4);
#endif
  taucs_printf("oocsp_numfact: \n");
  taucs_printf("oocsp_numfact: %lg Mflop/s\n",(flops*1e-6)/(time_total));
  taucs_printf("oocsp_numfact: %lg MB/s IO read\n",(bytes_read*1e-6)/(time_read));
  taucs_printf("oocsp_numfact: %lg MB/s IO write\n",(bytes_appended*1e-6)/(time_append));
  taucs_printf("oocsp_numfact: %lg col reuse\n",col_ooc_updates/col_read);
  taucs_printf("oocsp_numfact: %lg percent IO\n",(time_read+time_append)/time_total);
  taucs_printf("oocsp_numfact: %lg percent col/col\n",(time_colcol)/time_total);
#endif


#ifndef SIMPLE_COL_COL
  {
    extern int oocsp_spcol_n1,oocsp_spcol_n2;
    taucs_printf("oocsp_numfact: spcol counts %d %d\n",oocsp_spcol_n1,oocsp_spcol_n2);
  }
#endif

  return errorcode;
}


/*********************************************************/
/* FACTOR                                                */
/*********************************************************/

static 
int
oocsp_factor(taucs_ccs_matrix* A_in,
      taucs_io_handle* LU,
      int*  colperm)
{
  /*int i,j; omer*/
  int  rc;

  int* etree=NULL;
  int* postorder=NULL;
  int* l_colcounts=NULL;
  int* u_colcounts=NULL;
  int* panels=NULL;
  int* schedstart=NULL;
  int* schedend=NULL;
  int* fetchnext=NULL;
  int* ejectnext=NULL;
  int  spawidth;
  int  maxsn;

/*
  EF_PROTECT_BELOW=0;
  EF_PROTECT_FREE=0;
  EF_FILL=0x0;
  EF_ALIGNMENT=1;
  taucs_printf("EF align=%d below=%d fill=%d free=%d\n malloc0=%d\n",
	       EF_ALIGNMENT,EF_PROTECT_BELOW,EF_FILL,EF_PROTECT_FREE,EF_ALLOW_MALLOC_0);
*/

  taucs_printf("taucs_ooc_lu: starting\n");

  taucs_printf("taucs_ooc_lu: calling colanalyze\n");
  rc = oocsp_colanalyze(A_in,
	   	        taucs_io_get_basename(LU),
		        colperm,&etree,&postorder,&l_colcounts,&u_colcounts);
  if (rc != TAUCS_SUCCESS) return rc;

  taucs_printf("taucs_ooc_lu: calling panelize\n");
  rc = oocsp_panelize_simple(A_in->m,A_in->n,
	  	             postorder,
                             l_colcounts,u_colcounts,etree,
			     &spawidth,&maxsn,
			     &panels,&schedstart,&schedend,&fetchnext,&ejectnext);
  /* we don't need these any more */
  taucs_free(postorder);
  taucs_free(etree);
  if (rc != TAUCS_SUCCESS) return rc;

  taucs_printf("taucs_ooc_lu: calling numfact\n");
  rc = oocsp_numfact(A_in,colperm,
	  	     /*L,U,*/
		     LU,
		     panels,schedstart,schedend,fetchnext,ejectnext,
		     l_colcounts,
		     u_colcounts,
                     spawidth,maxsn);
  taucs_free(panels);
  taucs_free(schedstart);
  taucs_free(schedend);
  taucs_free(fetchnext);
  taucs_free(ejectnext);
  if (rc != TAUCS_SUCCESS) return rc;

  taucs_printf("taucs_ooc_lu: done, returning\n");

  return TAUCS_SUCCESS;
}

int
taucs_dtl(ooc_factor_lu)(taucs_ccs_matrix* A_in,
		         int    colperm[],
                         taucs_io_handle* LU,
		         double memory)
{
  remaining_memory = memory;
  taucs_printf("taucs_ooc_factor_lu: using %.0lf MBytes of in-core memory\n",
	     (remaining_memory)/1048576.0);
  return oocsp_factor(A_in,LU,colperm);
}

/*********************************************************/
/* SOLVE                                                 */
/*********************************************************/

static int
oocsp_solve(taucs_io_handle* LU,
	    taucs_datatype* X,
	    taucs_datatype* B)
{
  int errorcode = TAUCS_SUCCESS;

  int i,ip,j,n;
  taucs_datatype* Y      = NULL;
  taucs_datatype  Aij;
  taucs_datatype* values = NULL;
  int*    indices        = NULL;
  int*    irowperm       = NULL;
  int*    Lclen          = NULL;
  int*    Uclen          = NULL;
  int*    colperm        = NULL;
  int*    ipivots        = NULL;

  int     found;
  
  double  time_solve = taucs_wtime();
  double  bytes_read = 0;

  taucs_printf("oocsp_solve: starting\n");

  /* n = L->nrows; */
  if ((errorcode=taucs_io_read(LU, HEADER_NROWS, 1, 1, TAUCS_INT, &n)) != TAUCS_SUCCESS) 
    goto end;

  Y       = (taucs_datatype*) taucs_malloc(n * sizeof(taucs_datatype));
  values  = (taucs_datatype*) taucs_malloc(n * sizeof(taucs_datatype));
  indices = (int*)    taucs_malloc(n * sizeof(int));
  irowperm= (int*)    taucs_malloc(n * sizeof(int));

  Lclen   = (int*)    taucs_malloc(n * sizeof(int));
  Uclen   = (int*)    taucs_malloc(n * sizeof(int));

  colperm = (int*)    taucs_malloc(n * sizeof(int));
  ipivots = (int*)    taucs_malloc(n * sizeof(int));

  if ( !(Y && values && indices && irowperm 
	 && Lclen && Uclen && colperm && ipivots) ) {
    taucs_printf("taucs: Out-of-Core LU solve failed due to lack of memory\n");
    errorcode = TAUCS_ERROR_NOMEM;
    goto end;
  }

  if ((errorcode=taucs_io_read(LU, HEADER_LCLEN, n, 1, TAUCS_INT, Lclen)) != TAUCS_SUCCESS) goto end;
  if ((errorcode=taucs_io_read(LU, HEADER_UCLEN, n, 1, TAUCS_INT, Uclen)) != TAUCS_SUCCESS) goto end;

  if ((errorcode=taucs_io_read(LU, HEADER_COLPERM, n, 1, TAUCS_INT, colperm)) != TAUCS_SUCCESS) goto end;
  if ((errorcode=taucs_io_read(LU, HEADER_IPIVOTS, n, 1, TAUCS_INT, ipivots)) != TAUCS_SUCCESS) goto end;

  for(i=0; i<n; i++)
     irowperm[ipivots[i]]=i;

  /*
  taucs_printf("colperm = [");
  for(i=0; i<n; i++)
    taucs_printf("%d, ",colperm[i]);
  taucs_printf("\b\b]\n");

  taucs_printf("ipivots = [");
  for(i=0; i<n; i++)
    taucs_printf("%d, ",ipivots[i]);
  taucs_printf("\b\b]\n");
  */
  

  /* start by permuting B, X=PB */

  /*
  for (i=0; i<n; i++)
    PB[i] = B[ ipivots[i] ];
  */

  for (i=0; i<n; i++)
    Y[i] = B[i];

  for (j=0; j<n; j++) {
    /*oocsp_readcol(L,j,indices,values);*/
    if ((errorcode=Lreadcol(LU,j,Lclen[j],indices,values)) != TAUCS_SUCCESS) goto end;
    bytes_read += Lclen[j] * (sizeof(int) + sizeof(taucs_datatype));
    for (ip=0; ip < Lclen[j]; ip++) {
      i = indices[ip];
      Aij = values[ip];
      /*Y[i] = Y[i] - Aij*Y[irowperm[j]];*/
      Y[i] = taucs_sub(Y[i] , taucs_mul( Aij , Y[irowperm[j]] ));
    }
  }

  for (i=0; i<n; i++) X[i] = Y[i];

  for (j=n-1; j>=0; j--) {
    /*oocsp_readcol(U,j,indices,values);*/
    if ( (errorcode=Ureadcol(LU,j,Uclen[j],indices,values)) != TAUCS_SUCCESS) goto end;
    bytes_read += Uclen[j] * (sizeof(int) + sizeof(taucs_datatype));

    found = 0;
    for (ip=0; ip < Uclen[j]; ip++) {
      i = indices[ip];
      if (i == irowperm[j]) {
	found = 1;
	Aij = values[ip];
	/*X[i] = X[i] / Aij;*/
	X[i] = taucs_div( X[i] , Aij );
	values[ip] = taucs_zero; /* so we don't multiply in the daxpy below */
      }
    }
    assert( found );

    for (ip=0; ip < Uclen[j]; ip++) {
      i = indices[ip];
      Aij = values[ip];
      /*X[i] = X[i] - Aij*X[irowperm[j]];*/
      X[i] = taucs_sub( X[i] , taucs_mul( Aij , X[irowperm[j]] ));
    }
  }


  for (i=0; i<n; i++) Y[i] = X[i];
  for (i=0; i<n; i++)
    X[ ipivots[i] ] = Y[ i ];


  for (i=0; i<n; i++) Y[i] = X[i];
  for (i=0; i<n; i++)
    X[ colperm[i] ] = Y[ i ];

  /*
    X[ colperm[i] ] = Y[ i ];

  for (i=0; i<n; i++) Y[i] = X[i];
  for (i=0; i<n; i++)
    X[ ipivots[i] ] = Y[ i ];

  */

  /*
  for (i=0; i<n; i++) if (colperm[i] == 0) j=i;
  printf("rowperm[0]=%d irowperm[0]=%d colperm[0]=%d icp[0]=%d\n",ipivots[0], irowperm[0], colperm[0],j);
  for (i=0; i<n; i++) if (colperm[i] == 1) j=i;
  printf("rowperm[1]=%d irowperm[1]=%d colperm[1]=%d icp[1]=%d\n",ipivots[1], irowperm[1], colperm[1],j);
  */

  time_solve = (taucs_wtime() - time_solve);
  taucs_printf("oocsp_solve: done in %.0lf seconds, read %.0lf bytes (%.0lf MBytes)\n",
	     time_solve,bytes_read,bytes_read/1048576.0);

end:

  taucs_free(Y);
  taucs_free(values);
  taucs_free(indices);
  taucs_free(irowperm);
  taucs_free(Uclen);
  taucs_free(Lclen);
  taucs_free(ipivots);
  taucs_free(colperm);

  return errorcode; /* success */
} /* main */

int taucs_dtl(ooc_solve_lu)(taucs_io_handle*   LU,
			    taucs_datatype* x, taucs_datatype* b)
{
  return oocsp_solve(LU,x,b);
}

#endif /*#ifndef TAUCS_CORE_GENERAL*/

/*********************************************************/
/* USER CALLABLE ROUTINES                                */
/*********************************************************/

#ifdef TAUCS_CORE_GENERAL

int taucs_ooc_factor_lu(taucs_ccs_matrix* A,
			int*              colperm,
                        taucs_io_handle*  LU,                       
			double            memory)
{
#ifdef TAUCS_CONFIG_DREAL
  if (A->flags & TAUCS_DOUBLE) {
    return taucs_dooc_factor_lu(A,colperm,LU,memory);
  }
#endif

#ifdef TAUCS_CONFIG_DCOMPLEX
  if (A->flags & TAUCS_DCOMPLEX) {
    return taucs_zooc_factor_lu(A,colperm,LU,memory);
  }
#endif

#ifdef TAUCS_CONFIG_SREAL
  if (A->flags & TAUCS_SINGLE) {
    return taucs_sooc_factor_lu(A,colperm,LU,memory);
  }
#endif

#ifdef TAUCS_CONFIG_SCOMPLEX
  if (A->flags & TAUCS_SCOMPLEX) {
    return taucs_cooc_factor_lu(A,colperm,LU,memory);
  }
#endif

  return TAUCS_ERROR_DATATYPE;
}

int taucs_ooc_solve_lu (void* vLU,
			void* x, void* b)
{
  int errorcode;
  taucs_io_handle* LU = (taucs_io_handle*) vLU;
  int flags;

  if ((errorcode=taucs_io_read(LU, HEADER_FLAGS, 1, 1, TAUCS_INT, &flags)) != TAUCS_SUCCESS) 
    return errorcode;

  printf("taucs_ooc_solve_lu: starting, DZSC=%d%d%d%d\n",
	 (flags & TAUCS_DOUBLE  ) != 0,
	 (flags & TAUCS_DCOMPLEX) != 0,
	 (flags & TAUCS_SINGLE  ) != 0,
	 (flags & TAUCS_SCOMPLEX) != 0);
  
#ifdef TAUCS_CONFIG_DREAL
  if (flags & TAUCS_DOUBLE) {
    return taucs_dooc_solve_lu(LU,x,b);
  }
#endif

#ifdef TAUCS_CONFIG_DCOMPLEX
  if (flags & TAUCS_DCOMPLEX) {
    return taucs_zooc_solve_lu(LU,x,b);
  }
#endif

#ifdef TAUCS_CONFIG_SREAL
  if (flags & TAUCS_SINGLE) {
    return taucs_sooc_solve_lu(LU,x,b);
  }
#endif

#ifdef TAUCS_CONFIG_SCOMPLEX
  if (flags & TAUCS_SCOMPLEX) {
    return taucs_cooc_solve_lu(LU,x,b);
  }
#endif

  return TAUCS_ERROR_DATATYPE;
}

#endif /*#ifdef TAUCS_CORE_GENERAL*/

/*********************************************************/
/* END OF FILE                                           */
/*********************************************************/
